diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5453,7 +5453,7 @@
   EVT ShSVT = ShVT.getScalarType();
 
   // If MUL is unavailable, we cannot proceed in any case.
-  if (!isOperationLegalOrCustom(ISD::MUL, VT))
+  if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT))
     return SDValue();
 
   bool ComparingWithAllZeros = true;
@@ -5583,7 +5583,7 @@
   }
 
   if (!ComparingWithAllZeros && !AllComparisonsWithNonZerosAreTautological) {
-    if (!isOperationLegalOrCustom(ISD::SUB, VT))
+    if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::SUB, VT))
      return SDValue(); // FIXME: Could/should use `ISD::ADD`?
    assert(CompTargetNode.getValueType() == N.getValueType() &&
           "Expecting that the types on LHS and RHS of comparisons match.");
@@ -5598,7 +5598,7 @@
  // divisors as a performance improvement, since rotating by 0 is a no-op.
  if (HadEvenDivisor) {
    // We need ROTR to do this.
-    if (!isOperationLegalOrCustom(ISD::ROTR, VT))
+    if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT))
      return SDValue();
    SDNodeFlags Flags;
    Flags.setExact(true);
@@ -5628,6 +5628,8 @@
        DAG.getSetCC(DL, SETCCVT, D, CompTargetNode, ISD::SETULE);
    Created.push_back(TautologicalInvertedChannels.getNode());
 
+    // NOTE: we avoid letting illegal types through even if we're before
+    // legalize ops; legalization has a hard time producing good code for this.
    if (isOperationLegalOrCustom(ISD::VSELECT, SETCCVT)) {
      // If we have a vector select, let's replace the comparison results in the
      // affected lanes with the correct tautological result.
@@ -5638,6 +5640,8 @@
  }
 
  // Else, we can just invert the comparison result in the appropriate lanes.
+  //
+  // NOTE: see the note above the VSELECT case.
  if (isOperationLegalOrCustom(ISD::XOR, SETCCVT))
    return DAG.getNode(ISD::XOR, DL, SETCCVT, NewCC,
                       TautologicalInvertedChannels);
@@ -5692,8 +5696,9 @@
  EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
  EVT ShSVT = ShVT.getScalarType();
 
-  // If MUL is unavailable, we cannot proceed in any case.
-  if (!isOperationLegalOrCustom(ISD::MUL, VT))
+  // If we are after ops legalization, and MUL is unavailable, we cannot
+  // proceed.
+  if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT))
    return SDValue();
 
  // TODO: Could support comparing with non-zero too.
@@ -5848,7 +5853,7 @@
 
  if (NeedToApplyOffset) {
    // We need ADD to do this.
-    if (!isOperationLegalOrCustom(ISD::ADD, VT))
+    if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ADD, VT))
      return SDValue();
 
    // (add (mul N, P), A)
@@ -5860,7 +5865,7 @@
  // divisors as a performance improvement, since rotating by 0 is a no-op.
  if (HadEvenDivisor) {
    // We need ROTR to do this.
-    if (!isOperationLegalOrCustom(ISD::ROTR, VT))
+    if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT))
      return SDValue();
    SDNodeFlags Flags;
    Flags.setExact(true);
@@ -5883,6 +5888,9 @@
  // we must fix-up results for said lanes.
  assert(VT.isVector() && "Can/should only get here for vectors.");
 
+  // NOTE: we avoid letting illegal types through even if we're before
+  // legalize ops; legalization has a hard time producing good code for the
+  // code that follows.
  if (!isOperationLegalOrCustom(ISD::SETEQ, VT) ||
      !isOperationLegalOrCustom(ISD::AND, VT) ||
      !isOperationLegalOrCustom(Cond, VT) ||
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-illegal-types.ll
@@ -4,14 +4,14 @@
 define i1 @test_srem_odd(i29 %X) nounwind {
 ; CHECK-LABEL: test_srem_odd:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w9, #33099
-; CHECK-NEXT: mov w10, #64874
-; CHECK-NEXT: sbfx w8, w0, #0, #29
-; CHECK-NEXT: movk w9, #48986, lsl #16
-; CHECK-NEXT: movk w10, #330, lsl #16
-; CHECK-NEXT: madd w8, w8, w9, w10
-; CHECK-NEXT: mov w9, #64213
-; CHECK-NEXT: movk w9, #661, lsl #16
+; CHECK-NEXT: mov w8, #33099
+; CHECK-NEXT: mov w9, #24493
+; CHECK-NEXT: movk w8, #8026, lsl #16
+; CHECK-NEXT: movk w9, #41, lsl #16
+; CHECK-NEXT: madd w8, w0, w8, w9
+; CHECK-NEXT: mov w9, #48987
+; CHECK-NEXT: and w8, w8, #0x1fffffff
+; CHECK-NEXT: movk w9, #82, lsl #16
 ; CHECK-NEXT: cmp w8, w9
 ; CHECK-NEXT: cset w0, lo
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
--- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll
@@ -9,20 +9,18 @@
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT: adrp x8, .LCPI0_1
 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1]
-; CHECK-NEXT: adrp x8, .LCPI0_2
-; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
 ; CHECK-NEXT: adrp x8, .LCPI0_3
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_3]
+; CHECK-NEXT: adrp x8, .LCPI0_2
+; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_2]
+; CHECK-NEXT: adrp x8, .LCPI0_4
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_4]
 ; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: usra v3.4s, v1.4s, #31
-; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -82,27 +80,19 @@
 define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_srem_even_allones_eq:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI3_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: adrp x8, .LCPI3_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: adrp x8, .LCPI3_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT: adrp x8, .LCPI3_3
-; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT: adrp x8, .LCPI3_4
-; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_4]
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #31
-; CHECK-NEXT: and v1.16b, v1.16b, v4.16b
-; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: mov w9, #9362
+; CHECK-NEXT:
movk w8, #46811, lsl #16 +; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI3_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI3_0] +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v0.4s, v2.4s, #31 +; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -114,29 +104,21 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: adrp x8, .LCPI4_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] -; CHECK-NEXT: adrp x8, .LCPI4_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] -; CHECK-NEXT: adrp x8, .LCPI4_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI4_3] -; CHECK-NEXT: adrp x8, .LCPI4_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_4] -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #9362 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI4_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI4_0] +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v0.4s, v2.4s, #31 +; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -152,23 +134,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] ; CHECK-NEXT: adrp x8, .LCPI5_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] -; CHECK-NEXT: adrp x8, .LCPI5_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] ; CHECK-NEXT: adrp x8, .LCPI5_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: adrp x8, .LCPI5_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI5_2] ; CHECK-NEXT: adrp x8, .LCPI5_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -184,25 +161,20 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] ; CHECK-NEXT: adrp x8, .LCPI6_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] -; CHECK-NEXT: adrp x8, .LCPI6_2 -; CHECK-NEXT: ldr 
q3, [x8, :lo12:.LCPI6_2] ; CHECK-NEXT: adrp x8, .LCPI6_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_3] +; CHECK-NEXT: adrp x8, .LCPI6_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_2] ; CHECK-NEXT: adrp x8, .LCPI6_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -220,20 +192,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: adrp x8, .LCPI7_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1] -; CHECK-NEXT: adrp x8, .LCPI7_2 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2] ; CHECK-NEXT: adrp x8, .LCPI7_3 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_3] +; CHECK-NEXT: adrp x8, .LCPI7_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI7_2] +; CHECK-NEXT: adrp x8, .LCPI7_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -251,14 +221,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: adrp x8, .LCPI8_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1] -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-NEXT: sshr v3.4s, v1.4s, #3 -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI8_3 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_3] +; CHECK-NEXT: adrp x8, .LCPI8_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_2] +; CHECK-NEXT: adrp x8, .LCPI8_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -276,20 +250,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] ; CHECK-NEXT: adrp x8, .LCPI9_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] -; CHECK-NEXT: adrp x8, .LCPI9_2 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; 
CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] ; CHECK-NEXT: adrp x8, .LCPI9_3 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_3] +; CHECK-NEXT: adrp x8, .LCPI9_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_2] +; CHECK-NEXT: adrp x8, .LCPI9_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -328,25 +300,19 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_one: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: adrp x8, .LCPI11_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] -; CHECK-NEXT: adrp x8, .LCPI11_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: adrp x8, .LCPI11_3 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #9362 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI11_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI11_0] +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v0.4s, v2.4s, #31 +; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -364,24 +330,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] ; CHECK-NEXT: adrp x8, .LCPI12_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] -; CHECK-NEXT: adrp x8, .LCPI12_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] ; CHECK-NEXT: adrp x8, .LCPI12_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: adrp x8, .LCPI12_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_2] ; CHECK-NEXT: adrp x8, .LCPI12_4 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr 
v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -493,23 +453,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: adrp x8, .LCPI16_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] -; CHECK-NEXT: adrp x8, .LCPI16_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] ; CHECK-NEXT: adrp x8, .LCPI16_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: adrp x8, .LCPI16_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_2] ; CHECK-NEXT: adrp x8, .LCPI16_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -527,23 +482,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: adrp x8, .LCPI17_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] -; CHECK-NEXT: adrp x8, .LCPI17_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] ; CHECK-NEXT: adrp x8, .LCPI17_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_3] +; CHECK-NEXT: adrp x8, .LCPI17_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_2] ; CHECK-NEXT: adrp x8, .LCPI17_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -561,23 +511,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: adrp x8, .LCPI18_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] -; CHECK-NEXT: adrp x8, .LCPI18_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] ; CHECK-NEXT: adrp x8, .LCPI18_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: adrp x8, .LCPI18_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_2] ; CHECK-NEXT: adrp x8, .LCPI18_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and 
v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -616,27 +561,19 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_and_one: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: adrp x8, .LCPI20_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] -; CHECK-NEXT: adrp x8, .LCPI20_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2] -; CHECK-NEXT: adrp x8, .LCPI20_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_3] -; CHECK-NEXT: adrp x8, .LCPI20_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_4] -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #9362 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI20_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI20_0] +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v0.4s, v2.4s, #31 +; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -654,23 +591,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: adrp x8, .LCPI21_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] -; CHECK-NEXT: adrp x8, .LCPI21_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2] ; CHECK-NEXT: adrp x8, .LCPI21_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_3] +; CHECK-NEXT: adrp x8, .LCPI21_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_2] ; CHECK-NEXT: adrp x8, .LCPI21_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -690,24 +622,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: adrp x8, .LCPI22_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] -; CHECK-NEXT: adrp x8, .LCPI22_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2] ; CHECK-NEXT: adrp x8, .LCPI22_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: 
uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: adrp x8, .LCPI22_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_2] ; CHECK-NEXT: adrp x8, .LCPI22_4 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -725,21 +651,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: adrp x8, .LCPI23_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1] -; CHECK-NEXT: adrp x8, .LCPI23_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2] -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: adrp x8, .LCPI23_3 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_3] +; CHECK-NEXT: adrp x8, .LCPI23_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_2] +; CHECK-NEXT: adrp x8, .LCPI23_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -757,24 +680,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] ; CHECK-NEXT: adrp x8, .LCPI24_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] -; CHECK-NEXT: adrp x8, .LCPI24_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2] ; CHECK-NEXT: adrp x8, .LCPI24_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: adrp x8, .LCPI24_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI24_2] ; CHECK-NEXT: adrp x8, .LCPI24_4 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -793,22 +710,18 @@ ; CHECK-NEXT: ldr 
q1, [x8, :lo12:.LCPI25_0] ; CHECK-NEXT: adrp x8, .LCPI25_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1] -; CHECK-NEXT: adrp x8, .LCPI25_2 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_2] ; CHECK-NEXT: adrp x8, .LCPI25_3 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_3] -; CHECK-NEXT: neg v4.4s, v4.4s -; CHECK-NEXT: movi v3.2d, #0x000000ffffffff -; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_3] +; CHECK-NEXT: adrp x8, .LCPI25_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI25_2] +; CHECK-NEXT: adrp x8, .LCPI25_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -825,22 +738,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] ; CHECK-NEXT: adrp x8, .LCPI26_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] -; CHECK-NEXT: adrp x8, .LCPI26_2 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_2] ; CHECK-NEXT: adrp x8, .LCPI26_3 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] -; CHECK-NEXT: neg v4.4s, v4.4s -; CHECK-NEXT: movi v3.2d, #0x000000ffffffff -; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_3] +; CHECK-NEXT: adrp x8, .LCPI26_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI26_2] +; CHECK-NEXT: adrp x8, .LCPI26_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -29,17 +29,20 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 -; CHECK-NEXT: movk w8, #20971, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s -; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sshr v3.4s, v2.4s, #5 -; CHECK-NEXT: movi v1.4s, #100 -; CHECK-NEXT: usra v3.4s, v2.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: mov w9, #47184 +; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: movk w9, #1310, lsl #16 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: mov w10, #23592 +; 
CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: movk w10, #655, lsl #16 +; CHECK-NEXT: shl v0.4s, v2.4s, #30 +; CHECK-NEXT: ushr v1.4s, v2.4s, #2 +; CHECK-NEXT: dup v3.4s, w10 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -79,17 +82,20 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_neg100: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sshr v3.4s, v1.4s, #5 -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: mov w9, #47184 +; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: movk w9, #1310, lsl #16 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: mov w10, #23592 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: movk w10, #655, lsl #16 +; CHECK-NEXT: shl v0.4s, v2.4s, #30 +; CHECK-NEXT: ushr v1.4s, v2.4s, #2 +; CHECK-NEXT: dup v3.4s, w10 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -83,15 +83,13 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: mov w9, #18725 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: asr w9, w8, #18 -; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #14 -; CHECK-NEXT: msub w8, w8, w9, w0 -; CHECK-NEXT: tst w8, #0xffff -; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #4680 +; CHECK-NEXT: madd w8, w0, w8, w9 +; CHECK-NEXT: lsl w10, w8, #15 +; CHECK-NEXT: bfxil w10, w8, #1, #15 +; CHECK-NEXT: cmp w9, w10, uxth +; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret %srem = srem i16 %X, 14 %cmp = icmp ne i16 %srem, 0 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-illegal-types.ll @@ -4,13 +4,10 @@ define i1 @test_urem_odd(i13 %X) nounwind { ; CHECK-LABEL: test_urem_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #52429 -; CHECK-NEXT: and w8, w0, #0x1fff -; CHECK-NEXT: movk w9, #52428, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #13108 -; CHECK-NEXT: movk w9, #13107, lsl #16 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: mov w8, #3277 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: and w8, w8, #0x1fff +; CHECK-NEXT: cmp w8, #1639 // =1639 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret %urem = urem i13 %X, 5 @@ -21,13 +18,14 @@ define i1 @test_urem_even(i27 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #28087 -; CHECK-NEXT: and w8, w0, #0x7ffffff -; CHECK-NEXT: movk w9, #46811, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #9363 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: 
movk w8, #1755, lsl #16 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: lsl w9, w8, #26 +; CHECK-NEXT: bfxil w9, w8, #1, #26 +; CHECK-NEXT: and w8, w9, #0x7ffffff +; CHECK-NEXT: mov w9, #18725 +; CHECK-NEXT: movk w9, #146, lsl #16 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -39,12 +37,10 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; CHECK-LABEL: test_urem_odd_setne: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #52429 -; CHECK-NEXT: and w8, w0, #0xf -; CHECK-NEXT: movk w9, #52428, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #858993459 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: mov w8, #13 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: and w8, w8, #0xf +; CHECK-NEXT: cmp w8, #3 // =3 ; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: ret %urem = urem i4 %X, 5 @@ -55,13 +51,10 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { ; CHECK-LABEL: test_urem_negative_odd: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #57651 -; CHECK-NEXT: and w8, w0, #0x1ff -; CHECK-NEXT: movk w9, #43302, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #17191 -; CHECK-NEXT: movk w9, #129, lsl #16 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: mov w8, #307 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: and w8, w8, #0x1ff +; CHECK-NEXT: cmp w8, #1 // =1 ; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: ret %urem = urem i9 %X, -5 @@ -72,41 +65,29 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-LABEL: test_urem_vec: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w12, #43691 -; CHECK-NEXT: and w8, w0, #0x7ff -; CHECK-NEXT: movk w12, #43690, lsl #16 -; CHECK-NEXT: umull x12, w8, w12 -; CHECK-NEXT: mov w11, #25663 -; CHECK-NEXT: mov w13, #6 -; CHECK-NEXT: lsr x12, x12, #34 -; CHECK-NEXT: and w10, w2, #0x7ff -; CHECK-NEXT: movk w11, #160, lsl #16 -; CHECK-NEXT: msub w8, w12, w13, w8 -; CHECK-NEXT: mov w12, #18725 -; CHECK-NEXT: and w9, w1, #0x7ff -; CHECK-NEXT: movk w12, #9362, lsl #16 -; CHECK-NEXT: umull x11, w10, w11 -; CHECK-NEXT: adrp x13, .LCPI4_0 -; CHECK-NEXT: umull x12, w9, w12 -; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: ldr d0, [x13, :lo12:.LCPI4_0] -; CHECK-NEXT: lsr x12, x12, #32 -; CHECK-NEXT: sub w13, w10, w11 -; CHECK-NEXT: add w11, w11, w13, lsr #1 -; CHECK-NEXT: sub w13, w9, w12 -; CHECK-NEXT: add w12, w12, w13, lsr #1 -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov w8, #2043 -; CHECK-NEXT: lsr w11, w11, #10 -; CHECK-NEXT: lsr w12, w12, #2 -; CHECK-NEXT: msub w8, w11, w8, w10 -; CHECK-NEXT: sub w10, w12, w12, lsl #3 -; CHECK-NEXT: add w9, w9, w10 -; CHECK-NEXT: mov v1.h[1], w9 -; CHECK-NEXT: mov v1.h[2], w8 -; CHECK-NEXT: bic v1.4h, #248, lsl #8 -; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h -; CHECK-NEXT: mvn v0.8b, v0.8b +; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: adrp x9, .LCPI4_1 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: ldr d2, [x9, :lo12:.LCPI4_1] +; CHECK-NEXT: adrp x8, .LCPI4_2 +; CHECK-NEXT: adrp x9, .LCPI4_3 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_2] +; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ldr d1, [x9, :lo12:.LCPI4_3] +; CHECK-NEXT: mul v0.4h, v0.4h, v2.4h +; CHECK-NEXT: adrp x8, .LCPI4_4 +; CHECK-NEXT: shl v2.4h, v0.4h, #1 +; CHECK-NEXT: ushl v2.4h, v2.4h, v3.4h +; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI4_4] +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: bic v0.4h, #248, lsl #8 +; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b +; CHECK-NEXT: bic v0.4h, #248, lsl #8 +; CHECK-NEXT: cmhi v0.4h, v0.4h, v3.4h ; 
CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: umov w1, v0.h[1] ; CHECK-NEXT: umov w2, v0.h[2] diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll @@ -195,15 +195,12 @@ define i1 @t16_3_2(i16 %X) nounwind { ; CHECK-LABEL: t16_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: lsr w8, w8, #17 -; CHECK-NEXT: add w8, w8, w8, lsl #1 -; CHECK-NEXT: sub w8, w0, w8 -; CHECK-NEXT: and w8, w8, #0xffff -; CHECK-NEXT: cmp w8, #2 // =2 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: mov w8, #-21845 +; CHECK-NEXT: mov w9, #-21846 +; CHECK-NEXT: madd w8, w0, w8, w9 +; CHECK-NEXT: mov w9, #21845 +; CHECK-NEXT: cmp w9, w8, uxth +; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: ret %urem = urem i16 %X, 3 %cmp = icmp eq i16 %urem, 2 @@ -213,15 +210,12 @@ define i1 @t8_3_2(i8 %X) nounwind { ; CHECK-LABEL: t8_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: mov w9, #171 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: lsr w8, w8, #9 -; CHECK-NEXT: add w8, w8, w8, lsl #1 -; CHECK-NEXT: sub w8, w0, w8 +; CHECK-NEXT: mov w8, #-85 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: sub w8, w8, #86 // =86 ; CHECK-NEXT: and w8, w8, #0xff -; CHECK-NEXT: cmp w8, #2 // =2 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: cmp w8, #85 // =85 +; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret %urem = urem i8 %X, 3 %cmp = icmp eq i8 %urem, 2 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -11,17 +11,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] ; CHECK-NEXT: adrp x8, .LCPI0_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI0_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -79,17 +76,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] ; CHECK-NEXT: adrp x8, .LCPI3_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI3_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, 
v1.16b ; CHECK-NEXT: ret @@ -107,19 +101,16 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] ; CHECK-NEXT: adrp x8, .LCPI4_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI4_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -137,17 +128,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] ; CHECK-NEXT: adrp x8, .LCPI5_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI5_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -165,19 +153,16 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] ; CHECK-NEXT: adrp x8, .LCPI6_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI6_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -197,13 +182,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1] ; CHECK-NEXT: adrp x8, .LCPI7_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI7_3 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; 
CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -223,17 +209,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1] ; CHECK-NEXT: adrp x8, .LCPI8_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI8_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -253,17 +236,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] ; CHECK-NEXT: adrp x8, .LCPI9_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI9_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -299,26 +279,16 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_even_one: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: adrp x8, .LCPI11_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] -; CHECK-NEXT: adrp x8, .LCPI11_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: adrp x8, .LCPI11_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_3] -; CHECK-NEXT: adrp x8, .LCPI11_4 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4] -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: adrp x9, .LCPI11_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v1.4s, v0.4s, #31 +; CHECK-NEXT: ushr v0.4s, v0.4s, #1 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -338,20 +308,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] ; CHECK-NEXT: adrp x8, .LCPI12_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI12_3 -; CHECK-NEXT: 
ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_3] -; CHECK-NEXT: adrp x8, .LCPI12_4 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -373,13 +337,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1] ; CHECK-NEXT: adrp x8, .LCPI13_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI13_3 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -399,17 +364,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] ; CHECK-NEXT: adrp x8, .LCPI14_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI14_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -429,17 +391,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] ; CHECK-NEXT: adrp x8, .LCPI15_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI15_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -461,13 +420,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] ; CHECK-NEXT: adrp x8, .LCPI16_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] -; 
CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI16_3 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -487,17 +447,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] ; CHECK-NEXT: adrp x8, .LCPI17_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI17_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -517,13 +474,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] ; CHECK-NEXT: adrp x8, .LCPI18_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI18_3 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -564,20 +522,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] ; CHECK-NEXT: adrp x8, .LCPI20_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI20_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_3] -; CHECK-NEXT: adrp x8, .LCPI20_4 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -598,15 +550,13 @@ ; CHECK-NEXT: adrp x8, .LCPI21_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2] ; CHECK-NEXT: adrp x8, .LCPI21_3 -; CHECK-NEXT: umull2 
v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -629,15 +579,13 @@
 ; CHECK-NEXT: adrp x8, .LCPI22_2
 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2]
 ; CHECK-NEXT: adrp x8, .LCPI22_3
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -657,20 +605,14 @@
 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1]
 ; CHECK-NEXT: adrp x8, .LCPI23_2
 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
 ; CHECK-NEXT: adrp x8, .LCPI23_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3]
-; CHECK-NEXT: adrp x8, .LCPI23_4
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4]
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_3]
 ; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -691,15 +633,13 @@
 ; CHECK-NEXT: adrp x8, .LCPI24_2
 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2]
 ; CHECK-NEXT: adrp x8, .LCPI24_3
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -721,15 +661,13 @@
 ; CHECK-NEXT: adrp x8, .LCPI25_2
 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_2]
 ; CHECK-NEXT: adrp x8, .LCPI25_3
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_3]
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_3]
+; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -748,20 +686,14 @@
 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1]
 ; CHECK-NEXT: adrp x8, .LCPI26_2
 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
 ; CHECK-NEXT: adrp x8, .LCPI26_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3]
-; CHECK-NEXT: adrp x8, .LCPI26_4
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_4]
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_3]
 ; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s
+; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll
@@ -45,18 +45,20 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: t32_6_part0:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
 ; CHECK-NEXT: mov w8, #43691
 ; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI2_0
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_0]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #2
-; CHECK-NEXT: movi v3.4s, #6
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: dup v2.4s, w8
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov w9, #43690
+; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: movk w9, #10922, lsl #16
+; CHECK-NEXT: shl v1.4s, v0.4s, #31
+; CHECK-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: dup v1.4s, w9
+; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %urem = urem <4 x i32> %X,
@@ -67,18 +69,19 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: t32_6_part1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #43691
-; CHECK-NEXT: movk w8, #43690, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI3_0
-; CHECK-NEXT: dup v1.4s, w8
-; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_0]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: ushr v1.4s, v1.4s, #2
-; CHECK-NEXT: movi v3.4s, #6
-; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: adrp x8, .LCPI3_0
+; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
+; CHECK-NEXT: mov w9, #43691
+; CHECK-NEXT: movk w9, #43690, lsl #16
+; CHECK-NEXT: adrp x8, .LCPI3_1
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: shl v1.4s, v0.4s, #31
+; CHECK-NEXT: ushr v0.4s, v0.4s, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s
 ; CHECK-NEXT: xtn v0.4h, v0.4s
 ; CHECK-NEXT: ret
 %urem = urem <4 x i32> %X,
@@ -92,22 +95,16 @@
 ; CHECK-NEXT: adrp x8, .LCPI4_0
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT: adrp x8, .LCPI4_1
+; CHECK-NEXT: mov w9, #43691
 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1]
-; CHECK-NEXT: adrp x8, .LCPI4_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2]
-; CHECK-NEXT: adrp x8, .LCPI4_3
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI4_3]
-; CHECK-NEXT: adrp x8, .LCPI4_4
-; CHECK-NEXT: umull2 v5.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: neg v2.4s, v2.4s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI4_4]
-; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
-; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s
+; CHECK-NEXT: movk w9, #43690, lsl #16
+; CHECK-NEXT: dup v3.4s, w9
+; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mul v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: movi d1, #0x00ffffffff0000
+; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT: ret
 %urem = urem <4 x i32> %X,
 %cmp = icmp eq <4 x i32> %urem,
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -26,16 +26,17 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
 ; CHECK-LABEL: test_urem_even_100:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov w8, #34079
-; CHECK-NEXT: movk w8, #20971, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: movi v1.4s, #100
-; CHECK-NEXT: ushr v2.4s, v2.4s, #5
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mov w8, #23593
+; CHECK-NEXT: movk w8, #49807, lsl #16
+; CHECK-NEXT: dup v1.4s, w8
+; CHECK-NEXT: mov w9, #23592
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: movk w9, #655, lsl #16
+; CHECK-NEXT: shl v1.4s, v0.4s, #30
+; CHECK-NEXT: ushr v0.4s, v0.4s, #2
+; CHECK-NEXT: dup v2.4s, w9
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -74,19 +75,11 @@
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT: adrp x8, .LCPI3_1
 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
-; CHECK-NEXT: adrp x8, .LCPI3_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
-; CHECK-NEXT: adrp x8, .LCPI3_3
-; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: shl v1.4s, v0.4s, #30
+; CHECK-NEXT: ushr v0.4s, v0.4s, #2
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll
--- a/llvm/test/CodeGen/AArch64/urem-seteq.ll
+++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll
@@ -78,14 +78,14 @@ define i16 @test_urem_even(i16 %X) nounwind {
 ; CHECK-LABEL: test_urem_even:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ubfx w8, w0, #1, #15
-; CHECK-NEXT: mov w9, #18725
-; CHECK-NEXT: mul w8, w8, w9
-; CHECK-NEXT: lsr w8, w8, #17
-; CHECK-NEXT: mov w9, #14
-; CHECK-NEXT: msub w8, w8, w9, w0
-; CHECK-NEXT: tst w8, #0xffff
-; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: mov w8, #28087
+; CHECK-NEXT: mul w8, w0, w8
+; CHECK-NEXT: and w9, w8, #0xfffc
+; CHECK-NEXT: lsr w9, w9, #1
+; CHECK-NEXT: bfi w9, w8, #15, #17
+; CHECK-NEXT: ubfx w8, w9, #1, #15
+; CHECK-NEXT: cmp w8, #2340 // =2340
+; CHECK-NEXT: cset w0, hi
 ; CHECK-NEXT: ret
 %urem = urem i16 %X, 14
 %cmp = icmp ne i16 %urem, 0
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -5,17 +5,12 @@
 ; CHECK-LABEL: test_srem_odd:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_bfe_i32 v0, v0, 0, 29
-; CHECK-NEXT: s_mov_b32 s5, 0xa57eb503
-; CHECK-NEXT: s_movk_i32 s4, 0x63
-; CHECK-NEXT: v_mul_hi_i32 v1, v0, s5
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0
-; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1
-; CHECK-NEXT: v_ashrrev_i32_e32 v1, 6, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, s4
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_mov_b32 s4, 0x1f5a814b
+; CHECK-NEXT: s_mov_b32 s5, 0x52bf5b
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, s4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, 0x295fad, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 %srem = srem i29 %X, 99
diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll
@@ -5,13 +5,12 @@
 ; CHECK-LABEL: test_urem_odd:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v0, 0x1fff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0xcccccccd
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_movk_i32 s4, 0x1fff
+; CHECK-NEXT: s_movk_i32 s5, 0x667
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0xccd, v0
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 %urem = urem i13 %X, 5
@@ -23,14 +22,14 @@
 ; CHECK-LABEL: test_urem_even:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 0x7ffffff, v0
+; CHECK-NEXT: s_mov_b32 s4, 0x6db6db7
+; CHECK-NEXT: s_mov_b32 s5, 0x924925
+; CHECK-NEXT: v_mul_lo_u32 v0, v0, s4
+; CHECK-NEXT: v_lshlrev_b32_e32 v1, 26, v0
 ; CHECK-NEXT: v_bfe_u32 v0, v0, 1, 26
-; CHECK-NEXT: s_mov_b32 s4, 0x92492493
-; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4
-; CHECK-NEXT: v_lshrrev_b32_e32 v0, 2, v0
-; CHECK-NEXT: v_mul_u32_u24_e32 v0, 14, v0
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_or_b32_e32 v0, v0, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s5, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 %urem = urem i27 %X, 14
@@ -43,12 +42,9 @@
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: v_and_b32_e32 v0, 15, v0
-; CHECK-NEXT: s_mov_b32 s4, 0xcccccccd
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1
-; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, 13, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 15, v0
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 3, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 %urem = urem i4 %X, 5
@@ -60,13 +56,11 @@
 ; CHECK-LABEL: test_urem_negative_odd:
 ; CHECK: ; %bb.0:
 ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0
-; CHECK-NEXT: s_mov_b32 s4, 0x2050c9f9
-; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4
-; CHECK-NEXT: v_lshrrev_b32_e32 v1, 6, v1
-; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x1fb, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CHECK-NEXT: s_movk_i32 s4, 0x1ff
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_mul_u32_u24_e32 v0, 0x133, v0
+; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT: v_cmp_lt_u32_e32 vcc, 1, v0
 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
 %urem = urem i9 %X, -5
diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll
@@ -10,32 +10,30 @@
 ; ARM5-LABEL: test_srem_odd:
 ; ARM5: @ %bb.0:
 ; ARM5-NEXT: ldr r2, .LCPI0_1
-; ARM5-NEXT: lsl r0, r0, #3
-; ARM5-NEXT: asr r0, r0, #3
 ; ARM5-NEXT: ldr r1, .LCPI0_0
 ; ARM5-NEXT: mla r3, r0, r2, r1
-; ARM5-NEXT: ldr r1, .LCPI0_2
+; ARM5-NEXT: ldr r2, .LCPI0_2
 ; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: cmp r3, r1
+; ARM5-NEXT: bic r1, r3, #-536870912
+; ARM5-NEXT: cmp r1, r2
 ; ARM5-NEXT: movlo r0, #1
 ; ARM5-NEXT: bx lr
 ; ARM5-NEXT: .p2align 2
 ; ARM5-NEXT: @ %bb.1:
 ; ARM5-NEXT: .LCPI0_0:
-; ARM5-NEXT: .long 21691754 @ 0x14afd6a
+; ARM5-NEXT: .long 2711469 @ 0x295fad
 ; ARM5-NEXT: .LCPI0_1:
-; ARM5-NEXT: .long 3210379595 @ 0xbf5a814b
+; ARM5-NEXT: .long 526025035 @ 0x1f5a814b
 ; ARM5-NEXT: .LCPI0_2:
-; ARM5-NEXT: .long 43383509 @ 0x295fad5
+; ARM5-NEXT: .long 5422939 @ 0x52bf5b
 ;
 ; ARM6-LABEL: test_srem_odd:
 ; ARM6: @ %bb.0:
 ; ARM6-NEXT: ldr r2, .LCPI0_1
-; ARM6-NEXT: lsl r0, r0, #3
-; ARM6-NEXT: asr r0, r0, #3
 ; ARM6-NEXT: ldr r1, .LCPI0_0
-; ARM6-NEXT: mla r1, r0, r2, r1
+; ARM6-NEXT: mla r0, r0, r2, r1
 ; ARM6-NEXT: ldr r2, .LCPI0_2
+; ARM6-NEXT: bic r1, r0, #-536870912
 ; ARM6-NEXT: mov r0, #0
 ; ARM6-NEXT: cmp r1, r2
 ; ARM6-NEXT: movlo r0, #1
@@ -43,22 +41,22 @@
 ; ARM6-NEXT: .p2align 2
 ; ARM6-NEXT: @ %bb.1:
 ; ARM6-NEXT: .LCPI0_0:
-; ARM6-NEXT: .long 21691754 @ 0x14afd6a
+; ARM6-NEXT: .long 2711469 @ 0x295fad
 ; ARM6-NEXT: .LCPI0_1:
-; ARM6-NEXT: .long 3210379595 @ 0xbf5a814b
+; ARM6-NEXT: .long 526025035 @ 0x1f5a814b
 ; ARM6-NEXT: .LCPI0_2:
-; ARM6-NEXT: .long 43383509 @ 0x295fad5
+; ARM6-NEXT: .long 5422939 @ 0x52bf5b
 ;
 ; ARM7-LABEL: test_srem_odd:
 ; ARM7: @ %bb.0:
-; ARM7-NEXT: movw r1, #64874
+; ARM7-NEXT: movw r1, #24493
 ; ARM7-NEXT: movw r2, #33099
-; ARM7-NEXT: sbfx r0, r0, #0, #29
-; ARM7-NEXT: movt r1, #330
-; ARM7-NEXT: movt r2, #48986
-; ARM7-NEXT: mla r1, r0, r2, r1
-; ARM7-NEXT: movw r2, #64213
-; ARM7-NEXT: movt r2, #661
+; ARM7-NEXT: movt r1, #41
+; ARM7-NEXT: movt r2, #8026
+; ARM7-NEXT: mla r0, r0, r2, r1
+; ARM7-NEXT: movw r2, #48987
+; ARM7-NEXT: movt r2, #82
+; ARM7-NEXT: bic r1, r0, #-536870912
 ; ARM7-NEXT: mov r0, #0
 ; ARM7-NEXT: cmp r1, r2
 ; ARM7-NEXT: movwlo r0, #1
@@ -66,14 +64,14 @@
 ;
 ; ARM8-LABEL: test_srem_odd:
 ; ARM8: @ %bb.0:
-; ARM8-NEXT: movw r1, #64874
+; ARM8-NEXT: movw r1, #24493
 ; ARM8-NEXT: movw r2, #33099
-; ARM8-NEXT: sbfx r0, r0, #0, #29
-; ARM8-NEXT: movt r1, #330
-; ARM8-NEXT: movt r2, #48986
-; ARM8-NEXT: mla r1, r0, r2, r1
-; ARM8-NEXT: movw r2, #64213
-; ARM8-NEXT: movt r2, #661
+; ARM8-NEXT: movt r1, #41
+; ARM8-NEXT: movt r2, #8026
+; ARM8-NEXT: mla r0, r0, r2, r1
+; ARM8-NEXT: movw r2, #48987
+; ARM8-NEXT: movt r2, #82
+; ARM8-NEXT: bic r1, r0, #-536870912
 ; ARM8-NEXT: mov r0, #0
 ; ARM8-NEXT: cmp r1, r2
 ; ARM8-NEXT: movwlo r0, #1
@@ -81,14 +79,14 @@
 ;
 ; NEON7-LABEL: test_srem_odd:
 ; NEON7: @ %bb.0:
-; NEON7-NEXT: movw r1, #64874
+; NEON7-NEXT: movw r1, #24493
 ; NEON7-NEXT: movw r2, #33099
-; NEON7-NEXT: sbfx r0, r0, #0, #29
-; NEON7-NEXT: movt r1, #330
-; NEON7-NEXT: movt r2, #48986
-; NEON7-NEXT: mla r1, r0, r2, r1
-; NEON7-NEXT: movw r2, #64213
-; NEON7-NEXT: movt r2, #661
+; NEON7-NEXT: movt r1, #41
+; NEON7-NEXT: movt r2, #8026
+; NEON7-NEXT: mla r0, r0, r2, r1
+; NEON7-NEXT: movw r2, #48987
+; NEON7-NEXT: movt r2, #82
+; NEON7-NEXT: bic r1, r0, #-536870912
 ; NEON7-NEXT: mov r0, #0
 ; NEON7-NEXT: cmp r1, r2
 ; NEON7-NEXT: movwlo r0, #1
@@ -96,14 +94,14 @@
 ;
 ; NEON8-LABEL: test_srem_odd:
 ; NEON8: @ %bb.0:
-; NEON8-NEXT: movw r1, #64874
+; NEON8-NEXT: movw r1, #24493
 ; NEON8-NEXT: movw r2, #33099
-; NEON8-NEXT: sbfx r0, r0, #0, #29
-; NEON8-NEXT: movt r1, #330
-; NEON8-NEXT: movt r2, #48986
-; NEON8-NEXT: mla r1, r0, r2, r1
-; NEON8-NEXT: movw r2, #64213
-; NEON8-NEXT: movt r2, #661
+; NEON8-NEXT: movt r1, #41
+; NEON8-NEXT: movt r2, #8026
+; NEON8-NEXT: mla r0, r0, r2, r1
+; NEON8-NEXT: movw r2, #48987
+; NEON8-NEXT: movt r2, #82
+; NEON8-NEXT: bic r1, r0, #-536870912
 ; NEON8-NEXT: mov r0, #0
 ; NEON8-NEXT: cmp r1, r2
 ; NEON8-NEXT: movwlo r0, #1
diff --git a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/ARM/urem-seteq-illegal-types.ll
@@ -9,90 +9,74 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; ARM5-LABEL: test_urem_odd:
 ; ARM5: @ %bb.0:
-; ARM5-NEXT: mov r1, #255
-; ARM5-NEXT: orr r1, r1, #7936
-; ARM5-NEXT: and r0, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI0_0
+; ARM5-NEXT: mov r1, #205
+; ARM5-NEXT: orr r1, r1, #3072
 ; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI0_1
+; ARM5-NEXT: mov r0, #255
+; ARM5-NEXT: orr r0, r0, #7936
+; ARM5-NEXT: and r1, r2, r0
+; ARM5-NEXT: mov r2, #103
+; ARM5-NEXT: orr r2, r2, #1536
 ; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: cmp r2, r1
+; ARM5-NEXT: cmp r1, r2
 ; ARM5-NEXT: movlo r0, #1
 ; ARM5-NEXT: bx lr
-; ARM5-NEXT: .p2align 2
-; ARM5-NEXT: @ %bb.1:
-; ARM5-NEXT: .LCPI0_0:
-; ARM5-NEXT: .long 3435973837 @ 0xcccccccd
-; ARM5-NEXT: .LCPI0_1:
-; ARM5-NEXT: .long 858993460 @ 0x33333334
 ;
 ; ARM6-LABEL: test_urem_odd:
 ; ARM6: @ %bb.0:
+; ARM6-NEXT: mov r1, #205
+; ARM6-NEXT: mov r2, #103
+; ARM6-NEXT: orr r1, r1, #3072
+; ARM6-NEXT: orr r2, r2, #1536
+; ARM6-NEXT: mul r0, r0, r1
 ; ARM6-NEXT: mov r1, #255
-; ARM6-NEXT: ldr r2, .LCPI0_1
 ; ARM6-NEXT: orr r1, r1, #7936
-; ARM6-NEXT: and r0, r0, r1
-; ARM6-NEXT: ldr r1, .LCPI0_0
-; ARM6-NEXT: mul r1, r0, r1
+; ARM6-NEXT: and r1, r0, r1
 ; ARM6-NEXT: mov r0, #0
 ; ARM6-NEXT: cmp r1, r2
 ; ARM6-NEXT: movlo r0, #1
 ; ARM6-NEXT: bx lr
-; ARM6-NEXT: .p2align 2
-; ARM6-NEXT: @ %bb.1:
-; ARM6-NEXT: .LCPI0_0:
-; ARM6-NEXT: .long 3435973837 @ 0xcccccccd
-; ARM6-NEXT: .LCPI0_1:
-; ARM6-NEXT: .long 858993460 @ 0x33333334
 ;
 ; ARM7-LABEL: test_urem_odd:
 ; ARM7: @ %bb.0:
-; ARM7-NEXT: movw r1, #52429
-; ARM7-NEXT: bfc r0, #13, #19
-; ARM7-NEXT: movt r1, #52428
-; ARM7-NEXT: movw r2, #13108
+; ARM7-NEXT: movw r1, #3277
+; ARM7-NEXT: movw r2, #1639
 ; ARM7-NEXT: mul r1, r0, r1
-; ARM7-NEXT: movt r2, #13107
 ; ARM7-NEXT: mov r0, #0
+; ARM7-NEXT: bfc r1, #13, #19
 ; ARM7-NEXT: cmp r1, r2
 ; ARM7-NEXT: movwlo r0, #1
 ; ARM7-NEXT: bx lr
 ;
 ; ARM8-LABEL: test_urem_odd:
 ; ARM8: @ %bb.0:
-; ARM8-NEXT: movw r1, #52429
-; ARM8-NEXT: bfc r0, #13, #19
-; ARM8-NEXT: movt r1, #52428
-; ARM8-NEXT: movw r2, #13108
+; ARM8-NEXT: movw r1, #3277
+; ARM8-NEXT: movw r2, #1639
 ; ARM8-NEXT: mul r1, r0, r1
-; ARM8-NEXT: movt r2, #13107
 ; ARM8-NEXT: mov r0, #0
+; ARM8-NEXT: bfc r1, #13, #19
 ; ARM8-NEXT: cmp r1, r2
 ; ARM8-NEXT: movwlo r0, #1
 ; ARM8-NEXT: bx lr
 ;
 ; NEON7-LABEL: test_urem_odd:
 ; NEON7: @ %bb.0:
-; NEON7-NEXT: movw r1, #52429
-; NEON7-NEXT: bfc r0, #13, #19
-; NEON7-NEXT: movt r1, #52428
-; NEON7-NEXT: movw r2, #13108
+; NEON7-NEXT: movw r1, #3277
+; NEON7-NEXT: movw r2, #1639
 ; NEON7-NEXT: mul r1, r0, r1
-; NEON7-NEXT: movt r2, #13107
 ; NEON7-NEXT: mov r0, #0
+; NEON7-NEXT: bfc r1, #13, #19
 ; NEON7-NEXT: cmp r1, r2
 ; NEON7-NEXT: movwlo r0, #1
 ; NEON7-NEXT: bx lr
 ;
 ; NEON8-LABEL: test_urem_odd:
 ; NEON8: @ %bb.0:
-; NEON8-NEXT: movw r1, #52429
-; NEON8-NEXT: bfc r0, #13, #19
-; NEON8-NEXT: movt r1, #52428
-; NEON8-NEXT: movw r2, #13108
+; NEON8-NEXT: movw r1, #3277
+; NEON8-NEXT: movw r2, #1639
 ; NEON8-NEXT: mul r1, r0, r1
-; NEON8-NEXT: movt r2, #13107
 ; NEON8-NEXT: mov r0, #0
+; NEON8-NEXT: bfc r1, #13, #19
 ; NEON8-NEXT: cmp r1, r2
 ; NEON8-NEXT: movwlo r0, #1
 ; NEON8-NEXT: bx lr
@@ -105,28 +89,32 @@
 ; ARM5-LABEL: test_urem_even:
 ; ARM5: @ %bb.0:
 ; ARM5-NEXT: ldr r1, .LCPI1_0
-; ARM5-NEXT: bic r0, r0, #-134217728
 ; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: ror r1, r2, #1
+; ARM5-NEXT: bic r0, r2, #-134217727
+; ARM5-NEXT: lsr r0, r0, #1
+; ARM5-NEXT: orr r0, r0, r2, lsl #26
 ; ARM5-NEXT: ldr r2, .LCPI1_1
+; ARM5-NEXT: bic r1, r0, #-134217728
+; ARM5-NEXT: mov r0, #0
 ; ARM5-NEXT: cmp r1, r2
 ; ARM5-NEXT: movlo r0, #1
 ; ARM5-NEXT: bx lr
 ; ARM5-NEXT: .p2align 2
 ; ARM5-NEXT: @ %bb.1:
 ; ARM5-NEXT: .LCPI1_0:
-; ARM5-NEXT: .long 3067833783 @ 0xb6db6db7
+; ARM5-NEXT: .long 115043767 @ 0x6db6db7
 ; ARM5-NEXT: .LCPI1_1:
-; ARM5-NEXT: .long 306783379 @ 0x12492493
+; ARM5-NEXT: .long 9586981 @ 0x924925
 ;
 ; ARM6-LABEL: test_urem_even:
 ; ARM6: @ %bb.0:
 ; ARM6-NEXT: ldr r1, .LCPI1_0
-; ARM6-NEXT: bic r0, r0, #-134217728
 ; ARM6-NEXT: ldr r2, .LCPI1_1
 ; ARM6-NEXT: mul r0, r0, r1
-; ARM6-NEXT: ror r1, r0, #1
+; ARM6-NEXT: bic r1, r0, #-134217727
+; ARM6-NEXT: lsr r1, r1, #1
+; ARM6-NEXT: orr r0, r1, r0, lsl #26
+; ARM6-NEXT: bic r1, r0, #-134217728
 ; ARM6-NEXT: mov r0, #0
 ; ARM6-NEXT: cmp r1, r2
 ; ARM6-NEXT: movlo r0, #1
@@ -134,19 +122,20 @@
 ; ARM6-NEXT: .p2align 2
 ; ARM6-NEXT: @ %bb.1:
 ; ARM6-NEXT: .LCPI1_0:
-; ARM6-NEXT: .long 3067833783 @ 0xb6db6db7
+; ARM6-NEXT: .long 115043767 @ 0x6db6db7
 ; ARM6-NEXT: .LCPI1_1:
-; ARM6-NEXT: .long 306783379 @ 0x12492493
+; ARM6-NEXT: .long 9586981 @ 0x924925
 ;
 ; ARM7-LABEL: test_urem_even:
 ; ARM7: @ %bb.0:
 ; ARM7-NEXT: movw r1, #28087
-; ARM7-NEXT: bic r0, r0, #-134217728
-; ARM7-NEXT: movt r1, #46811
-; ARM7-NEXT: movw r2, #9363
+; ARM7-NEXT: movw r2, #18725
+; ARM7-NEXT: movt r1, #1755
+; ARM7-NEXT: movt r2, #146
 ; ARM7-NEXT: mul r0, r0, r1
-; ARM7-NEXT: movt r2, #4681
-; ARM7-NEXT: ror r1, r0, #1
+; ARM7-NEXT: ubfx r1, r0, #1, #26
+; ARM7-NEXT: orr r0, r1, r0, lsl #26
+; ARM7-NEXT: bic r1, r0, #-134217728
 ; ARM7-NEXT: mov r0, #0
 ; ARM7-NEXT: cmp r1, r2
 ; ARM7-NEXT: movwlo r0, #1
@@ -155,12 +144,13 @@
 ; ARM8-LABEL: test_urem_even:
 ; ARM8: @ %bb.0:
 ; ARM8-NEXT: movw r1, #28087
-; ARM8-NEXT: bic r0, r0, #-134217728
-; ARM8-NEXT: movt r1, #46811
-; ARM8-NEXT: movw r2, #9363
+; ARM8-NEXT: movw r2, #18725
+; ARM8-NEXT: movt r1, #1755
+; ARM8-NEXT: movt r2, #146
 ; ARM8-NEXT: mul r0, r0, r1
-; ARM8-NEXT: movt r2, #4681
-; ARM8-NEXT: ror r1, r0, #1
+; ARM8-NEXT: ubfx r1, r0, #1, #26
+; ARM8-NEXT: orr r0, r1, r0, lsl #26
+; ARM8-NEXT: bic r1, r0, #-134217728
 ; ARM8-NEXT: mov r0, #0
 ; ARM8-NEXT: cmp r1, r2
 ; ARM8-NEXT: movwlo r0, #1
@@ -169,12 +159,13 @@
 ; NEON7-LABEL: test_urem_even:
 ; NEON7: @ %bb.0:
 ; NEON7-NEXT: movw r1, #28087
-; NEON7-NEXT: bic r0, r0, #-134217728
-; NEON7-NEXT: movt r1, #46811
-; NEON7-NEXT: movw r2, #9363
+; NEON7-NEXT: movw r2, #18725
+; NEON7-NEXT: movt r1, #1755
+; NEON7-NEXT: movt r2, #146
 ; NEON7-NEXT: mul r0, r0, r1
-; NEON7-NEXT: movt r2, #4681
-; NEON7-NEXT: ror r1, r0, #1
+; NEON7-NEXT: ubfx r1, r0, #1, #26
+; NEON7-NEXT: orr r0, r1, r0, lsl #26
+; NEON7-NEXT: bic r1, r0, #-134217728
 ; NEON7-NEXT: mov r0, #0
 ; NEON7-NEXT: cmp r1, r2
 ; NEON7-NEXT: movwlo r0, #1
@@ -183,12 +174,13 @@
 ; NEON8-LABEL: test_urem_even:
 ; NEON8: @ %bb.0:
 ; NEON8-NEXT: movw r1, #28087
-; NEON8-NEXT: bic r0, r0, #-134217728
-; NEON8-NEXT: movt r1, #46811
-; NEON8-NEXT: movw r2, #9363
+; NEON8-NEXT: movw r2, #18725
+; NEON8-NEXT: movt r1, #1755
+; NEON8-NEXT: movt r2, #146
 ; NEON8-NEXT: mul r0, r0, r1
-; NEON8-NEXT: movt r2, #4681
-; NEON8-NEXT: ror r1, r0, #1
+; NEON8-NEXT: ubfx r1, r0, #1, #26
+; NEON8-NEXT: orr r0, r1, r0, lsl #26
+; NEON8-NEXT: bic r1, r0, #-134217728
 ; NEON8-NEXT: mov r0, #0
 ; NEON8-NEXT: cmp r1, r2
 ; NEON8-NEXT: movwlo r0, #1
@@ -201,87 +193,61 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; ARM5-LABEL: test_urem_odd_setne:
 ; ARM5: @ %bb.0:
-; ARM5-NEXT: ldr r1, .LCPI2_0
-; ARM5-NEXT: and r0, r0, #15
+; ARM5-NEXT: mov r1, #13
 ; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI2_1
 ; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: cmp r2, r1
+; ARM5-NEXT: and r1, r2, #15
+; ARM5-NEXT: cmp r1, #3
 ; ARM5-NEXT: movhi r0, #1
 ; ARM5-NEXT: bx lr
-; ARM5-NEXT: .p2align 2
-; ARM5-NEXT: @ %bb.1:
-; ARM5-NEXT: .LCPI2_0:
-; ARM5-NEXT: .long 3435973837 @ 0xcccccccd
-; ARM5-NEXT: .LCPI2_1:
-; ARM5-NEXT: .long 858993459 @ 0x33333333
 ;
 ; ARM6-LABEL: test_urem_odd_setne:
 ; ARM6: @ %bb.0:
-; ARM6-NEXT: ldr r1, .LCPI2_0
-; ARM6-NEXT: and r0, r0, #15
-; ARM6-NEXT: ldr r2, .LCPI2_1
-; ARM6-NEXT: mul r1, r0, r1
+; ARM6-NEXT: mov r1, #13
+; ARM6-NEXT: mul r0, r0, r1
+; ARM6-NEXT: and r1, r0, #15
 ; ARM6-NEXT: mov r0, #0
-; ARM6-NEXT: cmp r1, r2
+; ARM6-NEXT: cmp r1, #3
 ; ARM6-NEXT: movhi r0, #1
 ; ARM6-NEXT: bx lr
-; ARM6-NEXT: .p2align 2
-; ARM6-NEXT: @ %bb.1:
-; ARM6-NEXT: .LCPI2_0:
-; ARM6-NEXT: .long 3435973837 @ 0xcccccccd
-; ARM6-NEXT: .LCPI2_1:
-; ARM6-NEXT: .long 858993459 @ 0x33333333
 ;
 ; ARM7-LABEL: test_urem_odd_setne:
 ; ARM7: @ %bb.0:
-; ARM7-NEXT: movw r1, #52429
-; ARM7-NEXT: and r0, r0, #15
-; ARM7-NEXT: movt r1, #52428
-; ARM7-NEXT: movw r2, #13107
-; ARM7-NEXT: mul r1, r0, r1
-; ARM7-NEXT: movt r2, #13107
+; ARM7-NEXT: mov r1, #13
+; ARM7-NEXT: mul r0, r0, r1
+; ARM7-NEXT: and r1, r0, #15
 ; ARM7-NEXT: mov r0, #0
-; ARM7-NEXT: cmp r1, r2
+; ARM7-NEXT: cmp r1, #3
 ; ARM7-NEXT: movwhi r0, #1
 ; ARM7-NEXT: bx lr
 ;
 ; ARM8-LABEL: test_urem_odd_setne:
 ; ARM8: @ %bb.0:
-; ARM8-NEXT: movw r1, #52429
-; ARM8-NEXT: and r0, r0, #15
-; ARM8-NEXT: movt r1, #52428
-; ARM8-NEXT: movw r2, #13107
+; ARM8-NEXT: mov r1, #13
+; ARM8-NEXT: mul r0, r0, r1
+; ARM8-NEXT: and r1, r0, #15
 ; ARM8-NEXT: mov r0, #0
-; ARM8-NEXT: cmp r1, r2
+; ARM8-NEXT: cmp r1, #3
 ; ARM8-NEXT: movwhi r0, #1
 ; ARM8-NEXT: bx lr
 ;
 ; NEON7-LABEL: test_urem_odd_setne:
 ; NEON7: @ %bb.0:
-; NEON7-NEXT: movw r1, #52429
-; NEON7-NEXT: and r0, r0, #15
-; NEON7-NEXT: movt r1, #52428
-; NEON7-NEXT: movw r2, #13107
-; NEON7-NEXT: mul r1, r0, r1
-; NEON7-NEXT: movt r2, #13107
+; NEON7-NEXT: mov r1, #13
+; NEON7-NEXT: mul r0, r0, r1
+; NEON7-NEXT: and r1, r0, #15
 ; NEON7-NEXT: mov r0, #0
-; NEON7-NEXT: cmp r1, r2
+; NEON7-NEXT: cmp r1, #3
 ; NEON7-NEXT: movwhi r0, #1
 ; NEON7-NEXT: bx lr
 ;
 ; NEON8-LABEL: test_urem_odd_setne:
 ; NEON8: @ %bb.0:
-; NEON8-NEXT: movw r1, #52429
-; NEON8-NEXT: and r0, r0, #15
-; NEON8-NEXT: movt r1, #52428
-; NEON8-NEXT: movw r2, #13107
-; NEON8-NEXT: mul r1, r0, r1
-; NEON8-NEXT: movt r2, #13107
+; NEON8-NEXT: mov r1, #13
+; NEON8-NEXT: mul r0, r0, r1
+; NEON8-NEXT: and r1, r0, #15
 ; NEON8-NEXT: mov r0, #0
-; NEON8-NEXT: cmp r1, r2
+; NEON8-NEXT: cmp r1, #3
 ; NEON8-NEXT: movwhi r0, #1
 ; NEON8-NEXT: bx lr
 %urem = urem i4 %X, 5
@@ -292,91 +258,67 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; ARM5-LABEL: test_urem_negative_odd:
 ; ARM5: @ %bb.0:
-; ARM5-NEXT: mov r1, #255
+; ARM5-NEXT: mov r1, #51
 ; ARM5-NEXT: orr r1, r1, #256
-; ARM5-NEXT: and r0, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI3_0
 ; ARM5-NEXT: mul r2, r0, r1
-; ARM5-NEXT: ldr r1, .LCPI3_1
+; ARM5-NEXT: mov r0, #255
+; ARM5-NEXT: orr r0, r0, #256
+; ARM5-NEXT: and r1, r2, r0
 ; ARM5-NEXT: mov r0, #0
-; ARM5-NEXT: cmp r2, r1
+; ARM5-NEXT: cmp r1, #1
 ; ARM5-NEXT: movhi r0, #1
 ; ARM5-NEXT: bx lr
-; ARM5-NEXT: .p2align 2
-; ARM5-NEXT: @ %bb.1:
-; ARM5-NEXT: .LCPI3_0:
-; ARM5-NEXT: .long 2837897523 @ 0xa926e133
-; ARM5-NEXT: .LCPI3_1:
-; ARM5-NEXT: .long 8471335 @ 0x814327
 ;
 ; ARM6-LABEL: test_urem_negative_odd:
 ; ARM6: @ %bb.0:
+; ARM6-NEXT: mov r1, #51
+; ARM6-NEXT: orr r1, r1, #256
+; ARM6-NEXT: mul r0, r0, r1
 ; ARM6-NEXT: mov r1, #255
-; ARM6-NEXT: ldr r2, .LCPI3_1
 ; ARM6-NEXT: orr r1, r1, #256
-; ARM6-NEXT: and r0, r0, r1
-; ARM6-NEXT: ldr r1, .LCPI3_0
-; ARM6-NEXT: mul r1, r0, r1
+; ARM6-NEXT: and r1, r0, r1
 ; ARM6-NEXT: mov r0, #0
-; ARM6-NEXT: cmp r1, r2
+; ARM6-NEXT: cmp r1, #1
 ; ARM6-NEXT: movhi r0, #1
 ; ARM6-NEXT: bx lr
-; ARM6-NEXT: .p2align 2
-; ARM6-NEXT: @ %bb.1:
-; ARM6-NEXT: .LCPI3_0:
-; ARM6-NEXT: .long 2837897523 @ 0xa926e133
-; ARM6-NEXT: .LCPI3_1:
-; ARM6-NEXT: .long 8471335 @ 0x814327
 ;
 ; ARM7-LABEL: test_urem_negative_odd:
 ; ARM7: @ %bb.0:
-; ARM7-NEXT: movw r1, #57651
-; ARM7-NEXT: bfc r0, #9, #23
-; ARM7-NEXT: movt r1, #43302
-; ARM7-NEXT: movw r2, #17191
+; ARM7-NEXT: movw r1, #307
 ; ARM7-NEXT: mul r1, r0, r1
-; ARM7-NEXT: movt r2, #129
 ; ARM7-NEXT: mov r0, #0
-; ARM7-NEXT: cmp r1, r2
+; ARM7-NEXT: bfc r1, #9, #23
+; ARM7-NEXT: cmp r1, #1
 ; ARM7-NEXT: movwhi r0, #1
 ; ARM7-NEXT: bx lr
 ;
 ; ARM8-LABEL: test_urem_negative_odd:
 ; ARM8: @ %bb.0:
-; ARM8-NEXT: movw r1, #57651
-; ARM8-NEXT: bfc r0, #9, #23
-; ARM8-NEXT: movt r1, #43302
-; ARM8-NEXT: movw r2, #17191
+; ARM8-NEXT: movw r1, #307
 ; ARM8-NEXT: mul r1, r0, r1
-; ARM8-NEXT: movt r2, #129
 ; ARM8-NEXT: mov r0, #0
-; ARM8-NEXT: cmp r1, r2
+; ARM8-NEXT: bfc r1, #9, #23
+; ARM8-NEXT: cmp r1, #1
 ; ARM8-NEXT: movwhi r0, #1
 ; ARM8-NEXT: bx lr
 ;
 ; NEON7-LABEL: test_urem_negative_odd:
 ; NEON7: @ %bb.0:
-; NEON7-NEXT: movw r1, #57651
-; NEON7-NEXT: bfc r0, #9, #23
-; NEON7-NEXT: movt r1, #43302
-; NEON7-NEXT: movw r2, #17191
+; NEON7-NEXT: movw r1, #307
 ; NEON7-NEXT: mul r1, r0, r1
-; NEON7-NEXT: movt r2, #129
 ; NEON7-NEXT: mov r0, #0
-; NEON7-NEXT: cmp r1, r2
+; NEON7-NEXT: bfc r1, #9, #23
+; NEON7-NEXT: cmp r1, #1
 ; NEON7-NEXT: movwhi r0, #1
 ; NEON7-NEXT: bx lr
 ;
 ; NEON8-LABEL: test_urem_negative_odd:
 ; NEON8: @ %bb.0:
-; NEON8-NEXT: movw r1, #57651
-; NEON8-NEXT: bfc r0, #9, #23
-; NEON8-NEXT: movt r1, #43302
-; NEON8-NEXT: movw r2, #17191
+; NEON8-NEXT: movw r1, #307
 ; NEON8-NEXT: mul r1, r0, r1
-; NEON8-NEXT: movt r2, #129
 ; NEON8-NEXT: mov r0, #0
-; NEON8-NEXT: cmp r1, r2
+; NEON8-NEXT: bfc r1, #9, #23
+; NEON8-NEXT: cmp r1, #1
 ; NEON8-NEXT: movwhi r0, #1
 ; NEON8-NEXT: bx lr
 %urem = urem i9 %X, -5
@@ -388,289 +330,291 @@
 ; ARM5-LABEL: test_urem_vec:
 ; ARM5: @ %bb.0:
 ; ARM5-NEXT: push {r4, r5, r11, lr}
+; ARM5-NEXT: mov r3, #183
+; ARM5-NEXT: mvn r12, #182
+; ARM5-NEXT: orr r3, r3, #1280
+; ARM5-NEXT: sub r12, r12, #1280
+; ARM5-NEXT: mov r4, #51
+; ARM5-NEXT: mla lr, r1, r3, r12
 ; ARM5-NEXT: mov r12, #255
-; ARM5-NEXT: ldr r3, .LCPI4_1
 ; ARM5-NEXT: orr r12, r12, #1792
-; ARM5-NEXT: ldr lr, .LCPI4_0
-; ARM5-NEXT: and r1, r1, r12
-; ARM5-NEXT: and r2, r2, r12
-; ARM5-NEXT: and r0, r0, r12
-; ARM5-NEXT: mla r4, r1, r3, lr
-; ARM5-NEXT: ldr r1, .LCPI4_2
-; ARM5-NEXT: ldr lr, .LCPI4_3
+; ARM5-NEXT: orr r4, r4, #768
 ; ARM5-NEXT: mov r3, #0
-; ARM5-NEXT: cmp r4, r1
-; ARM5-NEXT: ldr r4, .LCPI4_4
+; ARM5-NEXT: and r1, lr, r12
+; ARM5-NEXT: mvn lr, #101
+; ARM5-NEXT: sub lr, lr, #1536
+; ARM5-NEXT: cmp r1, #292
+; ARM5-NEXT: mla r5, r2, r4, lr
 ; ARM5-NEXT: mov r1, #0
 ; ARM5-NEXT: movhi r1, #1
-; ARM5-NEXT: mla r5, r2, r4, lr
-; ARM5-NEXT: ldr r2, .LCPI4_5
-; ARM5-NEXT: cmp r5, r2
-; ARM5-NEXT: ldr r5, .LCPI4_6
+; ARM5-NEXT: and r2, r5, r12
+; ARM5-NEXT: mov r5, #171
+; ARM5-NEXT: orr r5, r5, #512
+; ARM5-NEXT: cmp r2, #1
 ; ARM5-NEXT: mov r2, #0
-; ARM5-NEXT: movhi r2, #1
 ; ARM5-NEXT: mul r4, r0, r5
-; ARM5-NEXT: ldr r5, .LCPI4_7
-; ARM5-NEXT: ror r0, r4, #1
-; ARM5-NEXT: cmp r0, r5
+; ARM5-NEXT: mov r0, #1020
+; ARM5-NEXT: orr r0, r0, #1024
+; ARM5-NEXT: mov r5, #254
+; ARM5-NEXT: movhi r2, #1
+; ARM5-NEXT: orr r5, r5, #1792
+; ARM5-NEXT: and r0, r4, r0
+; ARM5-NEXT: lsr r0, r0, #1
+; ARM5-NEXT: orr r0, r0, r4, lsl #10
+; ARM5-NEXT: and r0, r0, r5
+; ARM5-NEXT: lsr r0, r0, #1
+; ARM5-NEXT: cmp r0, #170
 ; ARM5-NEXT: movhi r3, #1
 ; ARM5-NEXT: mov r0, r3
 ; ARM5-NEXT: pop {r4, r5, r11, pc}
-; ARM5-NEXT: .p2align 2
-; ARM5-NEXT: @ %bb.1:
-; ARM5-NEXT: .LCPI4_0:
-; ARM5-NEXT: .long 1227133513 @ 0x49249249
-; ARM5-NEXT: .LCPI4_1:
-; ARM5-NEXT: .long 3067833783 @ 0xb6db6db7
-; ARM5-NEXT: .LCPI4_2:
-; ARM5-NEXT: .long 613566756 @ 0x24924924
-; ARM5-NEXT: .LCPI4_3:
-; ARM5-NEXT: .long 4191955354 @ 0xf9dc299a
-; ARM5-NEXT: .LCPI4_4:
-; ARM5-NEXT: .long 2198989619 @ 0x8311eb33
-; ARM5-NEXT: .LCPI4_5:
-; ARM5-NEXT: .long 2102284 @ 0x20140c
-; ARM5-NEXT: .LCPI4_6:
-; ARM5-NEXT: .long 2863311531 @ 0xaaaaaaab
-; ARM5-NEXT: .LCPI4_7:
-; ARM5-NEXT: .long 715827882 @ 0x2aaaaaaa
 ;
 ; ARM6-LABEL: test_urem_vec:
 ; ARM6: @ %bb.0:
 ; ARM6-NEXT: push {r4, lr}
+; ARM6-NEXT: mov r4, #51
+; ARM6-NEXT: mvn lr, #101
+; ARM6-NEXT: orr r4, r4, #768
+; ARM6-NEXT: sub lr, lr, #1536
+; ARM6-NEXT: mov r3, #183
+; ARM6-NEXT: mvn r12, #182
+; ARM6-NEXT: mla r2, r2, r4, lr
+; ARM6-NEXT: mov r4, #171
+; ARM6-NEXT: orr r4, r4, #512
+; ARM6-NEXT: orr r3, r3, #1280
+; ARM6-NEXT: sub r12, r12, #1280
+; ARM6-NEXT: mul r0, r0, r4
+; ARM6-NEXT: mov r4, #1020
+; ARM6-NEXT: orr r4, r4, #1024
+; ARM6-NEXT: mla r1, r1, r3, r12
 ; ARM6-NEXT: mov r12, #255
-; ARM6-NEXT: ldr r3, .LCPI4_1
 ; ARM6-NEXT: orr r12, r12, #1792
-; ARM6-NEXT: ldr lr, .LCPI4_0
-; ARM6-NEXT: and r1, r1, r12
-; ARM6-NEXT: ldr r4, .LCPI4_4
 ; ARM6-NEXT: and r2, r2, r12
-; ARM6-NEXT: and r0, r0, r12
-; ARM6-NEXT: mla r1, r1, r3, lr
-; ARM6-NEXT: ldr lr, .LCPI4_2
 ; ARM6-NEXT: mov r3, #0
-; ARM6-NEXT: cmp r1, lr
-; ARM6-NEXT: ldr lr, .LCPI4_3
-; ARM6-NEXT: mla r2, r2, r4, lr
-; ARM6-NEXT: ldr r4, .LCPI4_5
+; ARM6-NEXT: and r4, r0, r4
+; ARM6-NEXT: lsr r4, r4, #1
+; ARM6-NEXT: orr r0, r4, r0, lsl #10
+; ARM6-NEXT: mov r4, #254
+; ARM6-NEXT: and r1, r1, r12
+; ARM6-NEXT: orr r4, r4, #1792
+; ARM6-NEXT: cmp r1, #292
 ; ARM6-NEXT: mov r1, #0
+; ARM6-NEXT: and r0, r0, r4
 ; ARM6-NEXT: movhi r1, #1
-; ARM6-NEXT: cmp r2, r4
-; ARM6-NEXT: ldr r4, .LCPI4_6
+; ARM6-NEXT: cmp r2, #1
 ; ARM6-NEXT: mov r2, #0
+; ARM6-NEXT: lsr r0, r0, #1
 ; ARM6-NEXT: movhi r2, #1
-; ARM6-NEXT: mul r0, r0, r4
-; ARM6-NEXT: ldr r4, .LCPI4_7
-; ARM6-NEXT: ror r0, r0, #1
-; ARM6-NEXT: cmp r0, r4
+; ARM6-NEXT: cmp r0, #170
 ; ARM6-NEXT: movhi r3, #1
 ; ARM6-NEXT: mov r0, r3
 ; ARM6-NEXT: pop {r4, pc}
-; ARM6-NEXT: .p2align 2
-; ARM6-NEXT: @ %bb.1:
-; ARM6-NEXT: .LCPI4_0:
-; ARM6-NEXT: .long 1227133513 @ 0x49249249
-; ARM6-NEXT: .LCPI4_1:
-; ARM6-NEXT: .long 3067833783 @ 0xb6db6db7
-; ARM6-NEXT: .LCPI4_2:
-; ARM6-NEXT: .long 613566756 @ 0x24924924
-; ARM6-NEXT: .LCPI4_3:
-; ARM6-NEXT: .long 4191955354 @ 0xf9dc299a
-; ARM6-NEXT: .LCPI4_4:
-; ARM6-NEXT: .long 2198989619 @ 0x8311eb33
-; ARM6-NEXT: .LCPI4_5:
-; ARM6-NEXT: .long 2102284 @ 0x20140c
-; ARM6-NEXT: .LCPI4_6:
-; ARM6-NEXT: .long 2863311531 @ 0xaaaaaaab
-; ARM6-NEXT: .LCPI4_7:
-; ARM6-NEXT: .long 715827882 @ 0x2aaaaaaa
 ;
 ; ARM7-LABEL: test_urem_vec:
 ; ARM7: @ %bb.0:
-; ARM7-NEXT: push {r4, lr}
-; ARM7-NEXT: movw r3, #18725
-; ARM7-NEXT: bfc r1, #11, #21
-; ARM7-NEXT: movt r3, #9362
-; ARM7-NEXT: bfc r2, #11, #21
-; ARM7-NEXT: umull r3, r12, r1, r3
-; ARM7-NEXT: bfc r0, #11, #21
-; ARM7-NEXT: movw r3, #25663
-; ARM7-NEXT: movt r3, #160
-; ARM7-NEXT: umull r3, lr, r2, r3
-; ARM7-NEXT: vldr d17, .LCPI4_0
-; ARM7-NEXT: movw r3, #43691
-; ARM7-NEXT: movt r3, #43690
-; ARM7-NEXT: umull r3, r4, r0, r3
-; ARM7-NEXT: sub r3, r1, r12
-; ARM7-NEXT: add r3, r12, r3, lsr #1
-; ARM7-NEXT: lsr r12, r3, #2
-; ARM7-NEXT: sub r3, r2, lr
-; ARM7-NEXT: lsr r4, r4, #2
-; ARM7-NEXT: add r4, r4, r4, lsl #1
-; ARM7-NEXT: add r3, lr, r3, lsr #1
-; ARM7-NEXT: sub r0, r0, r4, lsl #1
-; ARM7-NEXT: lsr lr, r3, #10
-; ARM7-NEXT: movw r3, #2043
 ; ARM7-NEXT: vmov.16 d16[0], r0
-; ARM7-NEXT: sub r0, r12, r12, lsl #3
-; ARM7-NEXT: mls r2, lr, r3, r2
-; ARM7-NEXT: add r0, r1, r0
-; ARM7-NEXT: vmov.16 d16[1], r0
+; ARM7-NEXT: vldr d17, .LCPI4_0
+; ARM7-NEXT: vmov.16 d16[1], r1
+; ARM7-NEXT: vldr d19, .LCPI4_3
 ; ARM7-NEXT: vmov.16 d16[2], r2
+; ARM7-NEXT: vsub.i16 d16, d16, d17
+; ARM7-NEXT: vldr d17, .LCPI4_1
+; ARM7-NEXT: vmul.i16 d16, d16, d17
+; ARM7-NEXT: vldr d17, .LCPI4_2
+; ARM7-NEXT: vneg.s16 d17, d17
+; ARM7-NEXT: vshl.i16 d18, d16, #1
 ; ARM7-NEXT: vbic.i16 d16, #0xf800
+; ARM7-NEXT: vshl.u16 d16, d16, d17
+; ARM7-NEXT: vshl.u16 d17, d18, d19
+; ARM7-NEXT: vorr d16, d16, d17
+; ARM7-NEXT: vldr d17, .LCPI4_4
+; ARM7-NEXT: vbic.i16 d16, #0xf800
-; ARM7-NEXT: vceq.i16 d16, d16, d17
-; ARM7-NEXT: vmvn d16, d16
+; ARM7-NEXT: vcgt.u16 d16, d16, d17
 ; ARM7-NEXT: vmov.u16 r0, d16[0]
 ; ARM7-NEXT: vmov.u16 r1, d16[1]
 ; ARM7-NEXT: vmov.u16 r2, d16[2]
-; ARM7-NEXT: pop {r4, pc}
+; ARM7-NEXT: bx lr
 ; ARM7-NEXT: .p2align 3
 ; ARM7-NEXT: @ %bb.1:
 ; ARM7-NEXT: .LCPI4_0:
 ; ARM7-NEXT: .short 0 @ 0x0
 ; ARM7-NEXT: .short 1 @ 0x1
 ; ARM7-NEXT: .short 2 @ 0x2
+; ARM7-NEXT: .zero 2
+; ARM7-NEXT: .LCPI4_1:
+; ARM7-NEXT: .short 683 @ 0x2ab
+; ARM7-NEXT: .short 1463 @ 0x5b7
+; ARM7-NEXT: .short 819 @ 0x333
+; ARM7-NEXT: .zero 2
+; ARM7-NEXT: .LCPI4_2:
+; ARM7-NEXT: .short 1 @ 0x1
+; ARM7-NEXT: .short 0 @ 0x0
+; ARM7-NEXT: .short 0 @ 0x0
+; ARM7-NEXT: .short 0 @ 0x0
+; ARM7-NEXT: .LCPI4_3:
+; ARM7-NEXT: .short 9 @ 0x9
+; ARM7-NEXT: .short 10 @ 0xa
+; ARM7-NEXT: .short 10 @ 0xa
+; ARM7-NEXT: .short 10 @ 0xa
+; ARM7-NEXT: .LCPI4_4:
+; ARM7-NEXT: .short 341 @ 0x155
+; ARM7-NEXT: .short 292 @ 0x124
+; ARM7-NEXT: .short 1 @ 0x1
 ; ARM7-NEXT: .short 0 @ 0x0
 ;
 ; ARM8-LABEL: test_urem_vec:
 ; ARM8: @ %bb.0:
-; ARM8-NEXT: push {r4, lr}
-; ARM8-NEXT: movw r3, #18725
-; ARM8-NEXT: bfc r1, #11, #21
-; ARM8-NEXT: movt r3, #9362
-; ARM8-NEXT: bfc r2, #11, #21
-; ARM8-NEXT: umull r3, r12, r1, r3
-; ARM8-NEXT: bfc r0, #11, #21
-; ARM8-NEXT: movw r3, #25663
-; ARM8-NEXT: movt r3, #160
-; ARM8-NEXT: umull r3, lr, r2, r3
-; ARM8-NEXT: vldr d17, .LCPI4_0
-; ARM8-NEXT: movw r3, #43691
-; ARM8-NEXT: movt r3, #43690
-; ARM8-NEXT: umull r3, r4, r0, r3
-; ARM8-NEXT: sub r3, r1, r12
-; ARM8-NEXT: add r3, r12, r3, lsr #1
-; ARM8-NEXT: lsr r12, r3, #2
-; ARM8-NEXT: sub r3, r2, lr
-; ARM8-NEXT: lsr r4, r4, #2
-; ARM8-NEXT: add r4, r4, r4, lsl #1
-; ARM8-NEXT: add r3, lr, r3, lsr #1
-; ARM8-NEXT: sub r0, r0, r4, lsl #1
-; ARM8-NEXT: lsr lr, r3, #10
-; ARM8-NEXT: movw r3, #2043
 ; ARM8-NEXT: vmov.16 d16[0], r0
-; ARM8-NEXT: sub r0, r12, r12, lsl #3
-; ARM8-NEXT: mls r2, lr, r3, r2
-; ARM8-NEXT: add r0, r1, r0
-; ARM8-NEXT: vmov.16 d16[1], r0
+; ARM8-NEXT: vldr d17, .LCPI4_0
+; ARM8-NEXT: vmov.16 d16[1], r1
+; ARM8-NEXT: vldr d19, .LCPI4_3
 ; ARM8-NEXT: vmov.16 d16[2], r2
+; ARM8-NEXT: vsub.i16 d16, d16, d17
+; ARM8-NEXT: vldr d17, .LCPI4_1
+; ARM8-NEXT: vmul.i16 d16, d16, d17
+; ARM8-NEXT: vldr d17, .LCPI4_2
+; ARM8-NEXT: vneg.s16 d17, d17
+; ARM8-NEXT: vshl.i16 d18, d16, #1
+; ARM8-NEXT: vbic.i16 d16, #0xf800
+; ARM8-NEXT: vshl.u16 d16, d16, d17
+; ARM8-NEXT: vshl.u16 d17, d18, d19
+; ARM8-NEXT: vorr d16, d16, d17
+; ARM8-NEXT: vldr d17, .LCPI4_4
 ; ARM8-NEXT: vbic.i16 d16, #0xf800
-; ARM8-NEXT: vceq.i16 d16, d16, d17
-; ARM8-NEXT: vmvn d16, d16
+; ARM8-NEXT: vcgt.u16 d16, d16, d17
 ; ARM8-NEXT: vmov.u16 r0, d16[0]
 ; ARM8-NEXT: vmov.u16 r1, d16[1]
 ; ARM8-NEXT: vmov.u16 r2, d16[2]
-; ARM8-NEXT: pop {r4, pc}
+; ARM8-NEXT: bx lr
 ; ARM8-NEXT: .p2align 3
 ; ARM8-NEXT: @ %bb.1:
 ; ARM8-NEXT: .LCPI4_0:
 ; ARM8-NEXT: .short 0 @ 0x0
 ; ARM8-NEXT: .short 1 @ 0x1
 ; ARM8-NEXT: .short 2 @ 0x2
+; ARM8-NEXT: .zero 2
+; ARM8-NEXT: .LCPI4_1:
+; ARM8-NEXT: .short 683 @ 0x2ab
+; ARM8-NEXT: .short 1463 @ 0x5b7
+; ARM8-NEXT: .short 819 @ 0x333
+; ARM8-NEXT: .zero 2
+; ARM8-NEXT: .LCPI4_2:
+; ARM8-NEXT: .short 1 @ 0x1
+; ARM8-NEXT: .short 0 @ 0x0
+; ARM8-NEXT: .short 0 @ 0x0
+; ARM8-NEXT: .short 0 @ 0x0
+; ARM8-NEXT: .LCPI4_3:
+; ARM8-NEXT: .short 9 @ 0x9
+; ARM8-NEXT: .short 10 @ 0xa
+; ARM8-NEXT: .short 10 @ 0xa
+; ARM8-NEXT: .short 10 @ 0xa
+; ARM8-NEXT: .LCPI4_4:
+; ARM8-NEXT: .short 341 @ 0x155
+; ARM8-NEXT: .short 292 @ 0x124
+; ARM8-NEXT: .short 1 @ 0x1
 ; ARM8-NEXT: .short 0 @ 0x0
 ;
 ; NEON7-LABEL: test_urem_vec:
 ; NEON7: @ %bb.0:
-; NEON7-NEXT: push {r4, lr}
-; NEON7-NEXT: movw r3, #18725
-; NEON7-NEXT: bfc r1, #11, #21
-; NEON7-NEXT: movt r3, #9362
-; NEON7-NEXT: bfc r2, #11, #21
-; NEON7-NEXT: umull r3, r12, r1, r3
-; NEON7-NEXT: bfc r0, #11, #21
-; NEON7-NEXT: movw r3, #25663
-; NEON7-NEXT: movt r3, #160
-; NEON7-NEXT: umull r3, lr, r2, r3
-; NEON7-NEXT: vldr d17, .LCPI4_0
-; NEON7-NEXT: movw r3, #43691
-; NEON7-NEXT: movt r3, #43690
-; NEON7-NEXT: umull r3, r4, r0, r3
-; NEON7-NEXT: sub r3, r1, r12
-; NEON7-NEXT: add r3, r12, r3, lsr #1
-; NEON7-NEXT: lsr r12, r3, #2
-; NEON7-NEXT: sub r3, r2, lr
-; NEON7-NEXT: lsr r4, r4, #2
-; NEON7-NEXT: add r4, r4, r4, lsl #1
-; NEON7-NEXT: add r3, lr, r3, lsr #1
-; NEON7-NEXT: sub r0, r0, r4, lsl #1
-; NEON7-NEXT: lsr lr, r3, #10
-; NEON7-NEXT: movw r3, #2043
 ; NEON7-NEXT: vmov.16 d16[0], r0
-; NEON7-NEXT: sub r0, r12, r12, lsl #3
-; NEON7-NEXT: mls r2, lr, r3, r2
-; NEON7-NEXT: add r0, r1, r0
-; NEON7-NEXT: vmov.16 d16[1], r0
+; NEON7-NEXT: vldr d17, .LCPI4_0
+; NEON7-NEXT: vmov.16 d16[1], r1
+; NEON7-NEXT: vldr d19, .LCPI4_3
 ; NEON7-NEXT: vmov.16 d16[2], r2
+; NEON7-NEXT: vsub.i16 d16, d16, d17
+; NEON7-NEXT: vldr d17, .LCPI4_1
+; NEON7-NEXT: vmul.i16 d16, d16, d17
+; NEON7-NEXT: vldr d17, .LCPI4_2
+; NEON7-NEXT: vneg.s16 d17, d17
+; NEON7-NEXT: vshl.i16 d18, d16, #1
+; NEON7-NEXT: vbic.i16 d16, #0xf800
+; NEON7-NEXT: vshl.u16 d16, d16, d17
+; NEON7-NEXT: vshl.u16 d17, d18, d19
+; NEON7-NEXT: vorr d16, d16, d17
+; NEON7-NEXT: vldr d17, .LCPI4_4
 ; NEON7-NEXT: vbic.i16 d16, #0xf800
-; NEON7-NEXT: vceq.i16 d16, d16, d17
-; NEON7-NEXT: vmvn d16, d16
+; NEON7-NEXT: vcgt.u16 d16, d16, d17
 ; NEON7-NEXT: vmov.u16 r0, d16[0]
 ; NEON7-NEXT: vmov.u16 r1, d16[1]
 ; NEON7-NEXT: vmov.u16 r2, d16[2]
-; NEON7-NEXT: pop {r4, pc}
+; NEON7-NEXT: bx lr
 ; NEON7-NEXT: .p2align 3
 ; NEON7-NEXT: @ %bb.1:
 ; NEON7-NEXT: .LCPI4_0:
 ; NEON7-NEXT: .short 0 @ 0x0
 ; NEON7-NEXT: .short 1 @ 0x1
 ; NEON7-NEXT: .short 2 @ 0x2
+; NEON7-NEXT: .zero 2
+; NEON7-NEXT: .LCPI4_1:
+; NEON7-NEXT: .short 683 @ 0x2ab
+; NEON7-NEXT: .short 1463 @ 0x5b7
+; NEON7-NEXT: .short 819 @ 0x333
+; NEON7-NEXT: .zero 2
+; NEON7-NEXT: .LCPI4_2:
+; NEON7-NEXT: .short 1 @ 0x1
+; NEON7-NEXT: .short 0 @ 0x0
+; NEON7-NEXT: .short 0 @ 0x0
+; NEON7-NEXT: .short 0 @ 0x0
+; NEON7-NEXT: .LCPI4_3:
+; NEON7-NEXT: .short 9 @ 0x9
+; NEON7-NEXT: .short 10 @ 0xa
+; NEON7-NEXT: .short 10 @ 0xa
+; NEON7-NEXT: .short 10 @ 0xa
+; NEON7-NEXT: .LCPI4_4:
+; NEON7-NEXT: .short 341 @ 0x155
+; NEON7-NEXT: .short 292 @ 0x124
+; NEON7-NEXT: .short 1 @ 0x1
 ; NEON7-NEXT: .short 0 @ 0x0
 ;
 ; NEON8-LABEL: test_urem_vec:
 ; NEON8: @ %bb.0:
-; NEON8-NEXT: push {r4, lr}
-; NEON8-NEXT: movw r3, #18725
-; NEON8-NEXT: bfc r1, #11, #21
-; NEON8-NEXT: movt r3, #9362
-; NEON8-NEXT: bfc r2, #11, #21
-; NEON8-NEXT: umull r3, r12, r1, r3
-; NEON8-NEXT: bfc r0, #11, #21
-; NEON8-NEXT: movw r3, #25663
-; NEON8-NEXT: movt r3, #160
-; NEON8-NEXT: umull r3, lr, r2, r3
-; NEON8-NEXT: vldr d17, .LCPI4_0
-; NEON8-NEXT: movw r3, #43691
-; NEON8-NEXT: movt r3, #43690
-; NEON8-NEXT: umull r3, r4, r0, r3
-; NEON8-NEXT: sub r3, r1, r12
-; NEON8-NEXT: add r3, r12, r3, lsr #1
-; NEON8-NEXT: lsr r12, r3, #2
-; NEON8-NEXT: sub r3, r2, lr
-; NEON8-NEXT: lsr r4, r4, #2
-; NEON8-NEXT: add r4, r4, r4, lsl #1
-; NEON8-NEXT: add r3, lr, r3, lsr #1
-; NEON8-NEXT: sub r0, r0, r4, lsl #1
-; NEON8-NEXT: lsr lr, r3, #10
-; NEON8-NEXT: movw r3, #2043
 ; NEON8-NEXT: vmov.16 d16[0], r0
-; NEON8-NEXT: sub r0, r12, r12, lsl #3
-; NEON8-NEXT: mls r2, lr, r3, r2
-; NEON8-NEXT: add r0, r1, r0
-; NEON8-NEXT: vmov.16 d16[1], r0
+; NEON8-NEXT: vldr d17, .LCPI4_0
+; NEON8-NEXT: vmov.16 d16[1], r1
+; NEON8-NEXT: vldr d19, .LCPI4_3
 ; NEON8-NEXT: vmov.16 d16[2], r2
+; NEON8-NEXT: vsub.i16 d16, d16, d17
+; NEON8-NEXT: vldr d17, .LCPI4_1
+; NEON8-NEXT: vmul.i16 d16, d16, d17
+; NEON8-NEXT: vldr d17, .LCPI4_2
+; NEON8-NEXT: vneg.s16 d17, d17
+; NEON8-NEXT: vshl.i16 d18, d16, #1
 ; NEON8-NEXT: vbic.i16 d16, #0xf800
-; NEON8-NEXT: vceq.i16 d16, d16, d17
-; NEON8-NEXT: vmvn d16, d16
+; NEON8-NEXT: vshl.u16 d16, d16, d17
+; NEON8-NEXT: vshl.u16 d17, d18, d19
+; NEON8-NEXT: vorr d16, d16, d17
+; NEON8-NEXT: vldr d17, .LCPI4_4
+; NEON8-NEXT: vbic.i16 d16, #0xf800
+; NEON8-NEXT: vcgt.u16 d16, d16, d17
 ; NEON8-NEXT: vmov.u16 r0, d16[0]
 ; NEON8-NEXT: vmov.u16 r1, d16[1]
 ; NEON8-NEXT: vmov.u16 r2, d16[2]
-; NEON8-NEXT: pop {r4, pc}
+; NEON8-NEXT: bx lr
 ; NEON8-NEXT: .p2align 3
 ; NEON8-NEXT: @ %bb.1:
 ; NEON8-NEXT: .LCPI4_0:
 ; NEON8-NEXT: .short 0 @ 0x0
 ; NEON8-NEXT: .short 1 @ 0x1
 ; NEON8-NEXT: .short 2 @ 0x2
+; NEON8-NEXT: .zero 2
+; NEON8-NEXT: .LCPI4_1:
+; NEON8-NEXT: .short 683 @ 0x2ab
+; NEON8-NEXT: .short 1463 @ 0x5b7
+; NEON8-NEXT: .short 819 @ 0x333
+; NEON8-NEXT: .zero 2
+; NEON8-NEXT: .LCPI4_2:
+; NEON8-NEXT: .short 1 @ 0x1
+; NEON8-NEXT: .short 0 @ 0x0
+; NEON8-NEXT: .short 0 @ 0x0
+; NEON8-NEXT: .short 0 @ 0x0
+; NEON8-NEXT: .LCPI4_3:
+; NEON8-NEXT: .short 9 @ 0x9
+; NEON8-NEXT: .short 10 @ 0xa
+; NEON8-NEXT: .short 10 @ 0xa
+; NEON8-NEXT: .short 10 @ 0xa
+; NEON8-NEXT: .LCPI4_4:
+; NEON8-NEXT: .short 341 @ 0x155
+; NEON8-NEXT: .short 292 @ 0x124
+; NEON8-NEXT: .short 1 @ 0x1
 ; NEON8-NEXT: .short 0 @ 0x0
 %urem = urem <3 x i11> %X,
 %cmp = icmp ne <3 x i11> %urem,
@@ -680,86 +624,150 @@ define i1 @test_urem_larger(i63 %X) nounwind {
 ; ARM5-LABEL: test_urem_larger:
 ; ARM5: @ %bb.0:
-; ARM5-NEXT: push {r11, lr}
-; ARM5-NEXT: ldr r2, .LCPI5_0
-; ARM5-NEXT: bic r1, r1, #-2147483648
-; ARM5-NEXT: mov r3, #0
-; ARM5-NEXT: bl __umoddi3
-; ARM5-NEXT: orr r0, r0, r1
-; ARM5-NEXT: clz r0, r0
-; ARM5-NEXT: lsr r0, r0, #5
-; ARM5-NEXT: pop {r11, pc}
+; ARM5-NEXT: push {r4, lr}
+; ARM5-NEXT: ldr r12, .LCPI5_0
+; ARM5-NEXT: ldr r2, .LCPI5_1
+; ARM5-NEXT: umull r3, lr, r0, r12
+; ARM5-NEXT: mla r4, r0, r2, lr
+; ARM5-NEXT: mla r0, r1, r12, r4
+; ARM5-NEXT: bic r0, r0, #-2147483648
+; ARM5-NEXT: lsrs r0, r0, #1
+; ARM5-NEXT: rrx r1, r3
+; ARM5-NEXT: orr r0, r0, r3, lsl #30
+; ARM5-NEXT: ldr r3, .LCPI5_2
+; ARM5-NEXT: bic r2, r0, #-2147483648
+; ARM5-NEXT: mov r0, #0
+; ARM5-NEXT: subs r1, r1, r3
+; ARM5-NEXT: sbcs r1, r2, #1
+; ARM5-NEXT: movlo r0, #1
+; ARM5-NEXT: pop {r4, pc}
 ; ARM5-NEXT: .p2align 2
 ; ARM5-NEXT: @ %bb.1:
 ; ARM5-NEXT: .LCPI5_0:
-; ARM5-NEXT: .long 1234567890 @ 0x499602d2
+; ARM5-NEXT: .long 3456474841 @ 0xce059ed9
+; ARM5-NEXT: .LCPI5_1:
+; ARM5-NEXT: .long 790204738 @ 0x2f199142
+; ARM5-NEXT: .LCPI5_2:
+; ARM5-NEXT: .long 3175964122 @ 0xbd4d5dda
 ;
 ; ARM6-LABEL: test_urem_larger:
 ; ARM6: @ %bb.0:
 ; ARM6-NEXT: push {r11, lr}
-; ARM6-NEXT: ldr r2, .LCPI5_0
-; ARM6-NEXT: bic r1, r1, #-2147483648
-; ARM6-NEXT: mov r3, #0
-; ARM6-NEXT: bl __umoddi3
-; ARM6-NEXT: orr r0, r0, r1
-; ARM6-NEXT: clz r0, r0
-; ARM6-NEXT: lsr r0, r0, #5
+; ARM6-NEXT: ldr r12, .LCPI5_0
+; ARM6-NEXT: ldr r2, .LCPI5_1
+; ARM6-NEXT: umull r3, lr, r0, r12
+; ARM6-NEXT: mla r0, r0, r2, lr
+; ARM6-NEXT: mla r0, r1, r12, r0
+; ARM6-NEXT: bic r0, r0, #-2147483648
+; ARM6-NEXT: lsrs r0, r0, #1
+; ARM6-NEXT: rrx r1, r3
+; ARM6-NEXT: orr r0, r0, r3, lsl #30
+; ARM6-NEXT: ldr r3, .LCPI5_2
+; ARM6-NEXT: bic r2, r0, #-2147483648
+; ARM6-NEXT: mov r0, #0
+; ARM6-NEXT: subs r1, r1, r3
+; ARM6-NEXT: sbcs r1, r2, #1
+; ARM6-NEXT: movlo r0, #1
 ; ARM6-NEXT: pop {r11, pc}
 ; ARM6-NEXT: .p2align 2
 ; ARM6-NEXT: @ %bb.1:
 ; ARM6-NEXT: .LCPI5_0:
-; ARM6-NEXT: .long 1234567890 @ 0x499602d2
+; ARM6-NEXT: .long 3456474841 @ 0xce059ed9
+; ARM6-NEXT: .LCPI5_1:
+; ARM6-NEXT: .long 790204738 @ 0x2f199142
+; ARM6-NEXT: .LCPI5_2:
+; ARM6-NEXT: .long 3175964122 @ 0xbd4d5dda
 ;
 ; ARM7-LABEL: test_urem_larger:
 ; ARM7: @ %bb.0:
 ; ARM7-NEXT: push {r11, lr}
-; ARM7-NEXT: movw r2, #722
-; ARM7-NEXT: bic r1, r1, #-2147483648
-; ARM7-NEXT: movt r2, #18838
-; ARM7-NEXT: mov r3, #0
-; ARM7-NEXT: bl __umoddi3
-; ARM7-NEXT: orr r0, r0, r1
-; ARM7-NEXT: clz r0, r0
-; ARM7-NEXT: lsr r0, r0, #5
+; ARM7-NEXT: movw r12, #40665
+; ARM7-NEXT: movw r2, #37186
+; ARM7-NEXT: movt r12, #52741
+; ARM7-NEXT: movt r2, #12057
+; ARM7-NEXT: umull r3, lr, r0, r12
+; ARM7-NEXT: mla r0, r0, r2, lr
+; ARM7-NEXT: mla r0, r1, r12, r0
+; ARM7-NEXT: bic r0, r0, #-2147483648
+; ARM7-NEXT: lsrs r0, r0, #1
+; ARM7-NEXT: rrx r1, r3
+; ARM7-NEXT: orr r0, r0, r3, lsl #30
+; ARM7-NEXT: movw r3, #24026
+; ARM7-NEXT: bic r2, r0, #-2147483648
+; ARM7-NEXT: movt r3, #48461
+; ARM7-NEXT: subs r1, r1, r3
+; ARM7-NEXT: mov r0, #0
+; ARM7-NEXT: sbcs r1, r2, #1
+; ARM7-NEXT: movwlo r0, #1
 ; ARM7-NEXT: pop {r11, pc}
 ;
 ; ARM8-LABEL: test_urem_larger:
 ; ARM8: @ %bb.0:
 ; ARM8-NEXT: push {r11, lr}
-; ARM8-NEXT: movw r2, #722
-; ARM8-NEXT: bic r1, r1, #-2147483648
-; ARM8-NEXT: movt r2, #18838
-; ARM8-NEXT: mov r3, #0
-; ARM8-NEXT: bl __umoddi3
-; ARM8-NEXT: orr r0, r0, r1
-; ARM8-NEXT: clz r0, r0
-; ARM8-NEXT: lsr r0, r0, #5
+; ARM8-NEXT: movw r12, #40665
+; ARM8-NEXT: movw r2, #37186
+; ARM8-NEXT: movt r12, #52741
+; ARM8-NEXT: movt r2, #12057
+; ARM8-NEXT: umull r3, lr, r0, r12
+; ARM8-NEXT: mla r0, r0, r2, lr
+; ARM8-NEXT: mla r0, r1, r12, r0
+; ARM8-NEXT: bic r0, r0, #-2147483648
+; ARM8-NEXT: lsrs r0, r0, #1
+; ARM8-NEXT: rrx r1, r3
+; ARM8-NEXT: orr r0, r0, r3, lsl #30
+; ARM8-NEXT: movw r3, #24026
+; ARM8-NEXT: bic r2, r0, #-2147483648
+; ARM8-NEXT: movt r3, #48461
+; ARM8-NEXT: subs r1, r1, r3
+; ARM8-NEXT: mov r0, #0
+; ARM8-NEXT: sbcs r1, r2, #1
+; ARM8-NEXT: movwlo r0, #1
 ; ARM8-NEXT: pop {r11, pc}
 ;
 ; NEON7-LABEL: test_urem_larger:
 ; NEON7: @ %bb.0:
 ; NEON7-NEXT: push {r11, lr}
-; NEON7-NEXT: movw r2, #722
-; NEON7-NEXT: bic r1, r1, #-2147483648
-; NEON7-NEXT: movt r2, #18838
-; NEON7-NEXT: mov r3, #0
-; NEON7-NEXT: bl __umoddi3
-; NEON7-NEXT: orr r0, r0, r1
-; NEON7-NEXT: clz r0, r0
-; NEON7-NEXT: lsr r0, r0, #5
+; NEON7-NEXT: movw r12, #40665
+; NEON7-NEXT: movw r2, #37186
+; NEON7-NEXT: movt r12, #52741
+; NEON7-NEXT: movt r2, #12057
+; NEON7-NEXT: umull r3, lr, r0, r12
+; NEON7-NEXT: mla r0, r0, r2, lr
+; NEON7-NEXT: mla r0, r1, r12, r0
+; NEON7-NEXT: bic r0, r0, #-2147483648
+; NEON7-NEXT: lsrs r0, r0, #1
+; NEON7-NEXT: rrx r1, r3
+; NEON7-NEXT: orr r0, r0, r3, lsl #30
+; NEON7-NEXT: movw r3, #24026
+; NEON7-NEXT: bic r2, r0, #-2147483648
+; NEON7-NEXT: movt r3, #48461
+; NEON7-NEXT: subs r1, r1, r3
+; NEON7-NEXT: mov r0, #0
+; NEON7-NEXT: sbcs r1, r2, #1
+; NEON7-NEXT: movwlo r0, #1
 ; NEON7-NEXT: pop {r11, pc}
 ;
 ; NEON8-LABEL: test_urem_larger:
 ; NEON8: @ %bb.0:
 ; NEON8-NEXT: push {r11, lr}
-; NEON8-NEXT: movw r2, #722
-; NEON8-NEXT: bic r1, r1, #-2147483648
-; NEON8-NEXT: movt r2, #18838
-; NEON8-NEXT: mov r3, #0
-; NEON8-NEXT: bl __umoddi3
-; NEON8-NEXT: orr r0, r0, r1
-; NEON8-NEXT: clz r0, r0
-; NEON8-NEXT: lsr r0, r0, #5
+; NEON8-NEXT: movw r12, #40665
+; NEON8-NEXT: movw r2, #37186
+; NEON8-NEXT: movt r12, #52741
+; NEON8-NEXT: movt r2, #12057
+; NEON8-NEXT: umull r3, lr, r0, r12
+; NEON8-NEXT: mla r0, r0, r2, lr
+; NEON8-NEXT: mla r0, r1, r12, r0
+; NEON8-NEXT: bic r0, r0, #-2147483648
+; NEON8-NEXT: lsrs r0, r0, #1
+; NEON8-NEXT: rrx r1, r3
+; NEON8-NEXT: orr r0, r0, r3, lsl #30
+; NEON8-NEXT: movw r3, #24026
+; NEON8-NEXT: bic r2, r0, #-2147483648
+; NEON8-NEXT: movt r3, #48461
+; NEON8-NEXT: subs r1, r1, r3
+; NEON8-NEXT: mov r0, #0
+; NEON8-NEXT: sbcs r1, r2, #1
+; NEON8-NEXT: movwlo r0, #1
 ; NEON8-NEXT: pop {r11, pc}
 %urem = urem i63 %X, 1234567890
 %cmp = icmp eq i63 %urem, 0
diff --git a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Mips/srem-seteq-illegal-types.ll
@@ -5,32 +5,34 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; MIPSEL-LABEL: test_srem_odd:
 ; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 48986
+; MIPSEL-NEXT: lui $1, 8026
 ; MIPSEL-NEXT: ori $1, $1, 33099
-; MIPSEL-NEXT: sll $2, $4, 3
-; MIPSEL-NEXT: sra $2, $2, 3
-; MIPSEL-NEXT: mul $1, $2, $1
-; MIPSEL-NEXT: lui $2, 330
-; MIPSEL-NEXT: ori $2, $2, 64874
+; MIPSEL-NEXT: mul $1, $4, $1
+; MIPSEL-NEXT: lui $2, 41
+; MIPSEL-NEXT: ori $2, $2, 24493
 ; MIPSEL-NEXT: addu $1, $1, $2
-; MIPSEL-NEXT: lui $2, 661
-; MIPSEL-NEXT: ori $2, $2, 64213
+; MIPSEL-NEXT: lui $2, 8191
+; MIPSEL-NEXT: ori $2, $2, 65535
+; MIPSEL-NEXT: and $1, $1, $2
+; MIPSEL-NEXT: lui $2, 82
+; MIPSEL-NEXT: ori $2, $2, 48987
 ; MIPSEL-NEXT: jr $ra
 ; MIPSEL-NEXT: sltu $2, $1, $2
 ;
 ; MIPS64EL-LABEL: test_srem_odd:
 ; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 48986
+; MIPS64EL-NEXT: lui $1, 8026
 ; MIPS64EL-NEXT: ori $1, $1, 33099
 ; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: sll $2, $2, 3
-; MIPS64EL-NEXT: sra $2, $2, 3
 ; MIPS64EL-NEXT: mul $1, $2, $1
-; MIPS64EL-NEXT: lui $2, 330
-; MIPS64EL-NEXT: ori $2, $2, 64874
+; MIPS64EL-NEXT: lui $2, 41
+; MIPS64EL-NEXT: ori $2, $2, 24493
 ; MIPS64EL-NEXT: addu $1, $1, $2
-; MIPS64EL-NEXT: lui $2, 661
-; MIPS64EL-NEXT: ori $2, $2, 64213
+; MIPS64EL-NEXT: lui $2, 8191
+; MIPS64EL-NEXT: ori $2, $2, 65535
+; MIPS64EL-NEXT: and $1, $1, $2
+; MIPS64EL-NEXT: lui $2, 82
+; MIPS64EL-NEXT: ori $2, $2, 48987
 ; MIPS64EL-NEXT: jr $ra
 ; MIPS64EL-NEXT: sltu $2, $1, $2
 %srem = srem i29 %X, 99
diff --git a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll
@@ -5,26 +5,30 @@ define i1 @test_urem_odd(i13 %X) nounwind {
 ; MIPSEL-LABEL: test_urem_odd:
 ; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 52428
-; MIPSEL-NEXT: ori $1, $1, 52429
-; MIPSEL-NEXT: andi $2, $4, 8191
-; MIPSEL-NEXT: mul $1, $2, $1
-; MIPSEL-NEXT: lui $2, 13107
-; MIPSEL-NEXT: ori $2, $2, 13108
+; MIPSEL-NEXT: addiu $1, $zero, 3277
+; MIPSEL-NEXT: mul $1, $4, $1
+; MIPSEL-NEXT: andi $1, $1, 8191
 ; MIPSEL-NEXT: jr $ra
-; MIPSEL-NEXT: sltu $2, $1, $2
+; MIPSEL-NEXT: sltiu $2, $1, 1639
 ;
 ; MIPS64EL-LABEL: test_urem_odd:
 ; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 52428
-; MIPS64EL-NEXT: ori $1, $1, 52429
-; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: andi $2, $2, 8191
-; MIPS64EL-NEXT: mul $1, $2, $1
-; MIPS64EL-NEXT: lui $2, 13107
-; MIPS64EL-NEXT: ori $2, $2, 13108
+; MIPS64EL-NEXT: sll $1, $4, 0
+; MIPS64EL-NEXT: sll $2, $1, 1
+; MIPS64EL-NEXT: addu $2, $2, $1
+; MIPS64EL-NEXT: sll $3, $1, 4
+; MIPS64EL-NEXT: subu $2, $3, $2
+; MIPS64EL-NEXT: sll $3, $1, 6
+; MIPS64EL-NEXT: subu $2, $2, $3
+; MIPS64EL-NEXT: sll $3, $1, 8
+; MIPS64EL-NEXT: addu $2, $3, $2
+; MIPS64EL-NEXT: sll $3, $1, 10
+; MIPS64EL-NEXT: subu $2, $2, $3
+; MIPS64EL-NEXT: sll $1, $1, 12
+; MIPS64EL-NEXT: addu $1, $1, $2
+; MIPS64EL-NEXT: andi $1, $1, 8191
 ; MIPS64EL-NEXT: jr $ra
-; MIPS64EL-NEXT: sltu $2, $1, $2
+; MIPS64EL-NEXT: sltiu $2, $1, 1639
 %urem = urem i13 %X, 5
 %cmp = icmp eq i13 %urem, 0
 ret i1 %cmp
@@ -33,40 +37,40 @@ define i1 @test_urem_even(i27 %X) nounwind {
 ; MIPSEL-LABEL: test_urem_even:
 ; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 2047
-; MIPSEL-NEXT: ori $1, $1, 65535
-; MIPSEL-NEXT: and $1, $4, $1
-; MIPSEL-NEXT: srl $2, $1, 1
-; MIPSEL-NEXT: lui $3, 37449
-; MIPSEL-NEXT: ori $3, $3, 9363
-; MIPSEL-NEXT: multu $2, $3
-; MIPSEL-NEXT: mfhi $2
-; MIPSEL-NEXT: srl $2, $2, 2
-; MIPSEL-NEXT: sll $3, $2, 4
-; MIPSEL-NEXT: sll $2, $2, 1
-; MIPSEL-NEXT: subu $2, $2, $3
-; MIPSEL-NEXT: addu $1, $1, $2
+; MIPSEL-NEXT: lui $1, 1755
+; MIPSEL-NEXT: ori $1, $1, 28087
+; MIPSEL-NEXT: mul $1, $4, $1
+; MIPSEL-NEXT: sll $2, $1, 26
+; MIPSEL-NEXT: lui $3, 2047
+; MIPSEL-NEXT: ori $4, $3, 65534
+; MIPSEL-NEXT: and $1, $1, $4
+; MIPSEL-NEXT: srl $1, $1, 1
+; MIPSEL-NEXT: or $1, $1, $2
+; MIPSEL-NEXT: ori $2, $3, 65535
+; MIPSEL-NEXT: and $1, $1, $2
+; MIPSEL-NEXT: lui $2, 146
+; MIPSEL-NEXT: ori $2, $2, 18725
 ; MIPSEL-NEXT: jr $ra
-; MIPSEL-NEXT: sltiu $2, $1, 1
+; MIPSEL-NEXT: sltu $2, $1, $2
 ;
 ; MIPS64EL-LABEL: test_urem_even:
 ; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 2047
-; MIPS64EL-NEXT: ori $1, $1, 65535
+; MIPS64EL-NEXT: lui $1, 1755
+; MIPS64EL-NEXT: ori $1, $1, 28087
 ; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: and $1, $2, $1
-; MIPS64EL-NEXT: srl $2, $1, 1
-; MIPS64EL-NEXT: lui $3, 37449
-; MIPS64EL-NEXT: ori $3, $3, 9363
-; MIPS64EL-NEXT: multu $2, $3
-; MIPS64EL-NEXT: mfhi $2
-; MIPS64EL-NEXT: srl $2, $2, 2
-; MIPS64EL-NEXT: sll $3, $2, 4
-; MIPS64EL-NEXT: sll $2, $2, 1
-; MIPS64EL-NEXT: subu $2, $2, $3
-; MIPS64EL-NEXT: addu $1, $1, $2
+; MIPS64EL-NEXT: mul $1, $2, $1
+; MIPS64EL-NEXT: sll $2, $1, 26
+; MIPS64EL-NEXT: lui $3, 2047
+; MIPS64EL-NEXT: ori $4, $3, 65534
+; MIPS64EL-NEXT: and $1, $1, $4
+; MIPS64EL-NEXT: srl $1, $1, 1
+; MIPS64EL-NEXT: or $1, $1, $2
+; MIPS64EL-NEXT: ori $2, $3, 65535
+; MIPS64EL-NEXT: lui $3, 146
+; MIPS64EL-NEXT: and $1, $1, $2
+; MIPS64EL-NEXT: ori $2, $3, 18725
 ; MIPS64EL-NEXT: jr $ra
-; MIPS64EL-NEXT: sltiu $2, $1, 1
+; MIPS64EL-NEXT: sltu $2, $1, $2
 %urem = urem i27 %X, 14
 %cmp = icmp eq i27 %urem, 0
 ret i1 %cmp
@@ -75,24 +79,22 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; MIPSEL-LABEL: test_urem_odd_setne:
 ; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 52428
-; MIPSEL-NEXT: ori $1, $1, 52429
-; MIPSEL-NEXT: andi $2, $4, 15
-; MIPSEL-NEXT: mul $1, $2, $1
-; MIPSEL-NEXT: lui $2, 13107
-; MIPSEL-NEXT: ori $2, $2, 13107
+; MIPSEL-NEXT: sll $1, $4, 1
+; MIPSEL-NEXT: addu $1, $1, $4
+; MIPSEL-NEXT: negu $1, $1
+; MIPSEL-NEXT: andi $1, $1, 15
+; MIPSEL-NEXT: addiu $2, $zero, 3
 ; MIPSEL-NEXT: jr $ra
 ; MIPSEL-NEXT: sltu $2, $2, $1
 ;
 ; MIPS64EL-LABEL: test_urem_odd_setne:
 ; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 52428
-; MIPS64EL-NEXT: ori $1, $1, 52429
-; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: andi $2, $2, 15
-; MIPS64EL-NEXT: mul $1, $2, $1
-; MIPS64EL-NEXT: lui $2, 13107
-; MIPS64EL-NEXT: ori $2, $2, 13107
+; MIPS64EL-NEXT: sll $1, $4, 0
+; MIPS64EL-NEXT: sll $2, $1, 1
+; MIPS64EL-NEXT: addu $1, $2, $1
+; MIPS64EL-NEXT: negu $1, $1
+; MIPS64EL-NEXT: andi $1, $1, 15
+; MIPS64EL-NEXT: addiu $2, $zero, 3
 ; MIPS64EL-NEXT: jr $ra
 ; MIPS64EL-NEXT: sltu $2, $2, $1
 %urem = urem i4 %X, 5
@@ -103,26 +105,34 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind {
 ; MIPSEL-LABEL: test_urem_negative_odd:
 ; MIPSEL: # %bb.0:
-; MIPSEL-NEXT: lui $1, 43302
-; MIPSEL-NEXT: ori $1, $1, 57651
-; MIPSEL-NEXT: andi $2, $4, 511
-; MIPSEL-NEXT: mul $1, $2, $1
-; MIPSEL-NEXT: lui $2, 129
-; MIPSEL-NEXT: ori $2, $2, 17191
+; MIPSEL-NEXT: sll $1, $4, 1
+; MIPSEL-NEXT: addu $1, $1, $4
+; MIPSEL-NEXT: sll $2, $4, 4
+; MIPSEL-NEXT: subu $1, $1, $2
+; MIPSEL-NEXT: sll $2, $4, 6
+; MIPSEL-NEXT: addu $1, $2, $1
+; MIPSEL-NEXT: sll $2, $4, 8
+; MIPSEL-NEXT: addu $1, $2, $1
+; MIPSEL-NEXT: andi $1, $1, 511
+; MIPSEL-NEXT: addiu $2, $zero, 1
 ; MIPSEL-NEXT: jr $ra
 ; MIPSEL-NEXT: sltu $2, $2, $1
 ;
 ; MIPS64EL-LABEL: test_urem_negative_odd:
 ; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: lui $1, 43302
-; MIPS64EL-NEXT: ori $1, $1, 57651
-; MIPS64EL-NEXT: sll $2, $4, 0
-; MIPS64EL-NEXT: andi $2, $2, 511
-; MIPS64EL-NEXT: mul $1, $2, $1
-; MIPS64EL-NEXT: lui $2, 129
-; MIPS64EL-NEXT: ori $2, $2, 17191
+; MIPS64EL-NEXT: sll $1, $4, 0
+; MIPS64EL-NEXT: sll $2, $1, 1
+; MIPS64EL-NEXT: addu $2, $2, $1
+; MIPS64EL-NEXT: sll $3, $1, 4
+; MIPS64EL-NEXT: subu $2, $2, $3
+; MIPS64EL-NEXT: sll $3, $1, 6
+; MIPS64EL-NEXT: addu $2, $3, $2
+; MIPS64EL-NEXT: sll $1, $1, 8
+; MIPS64EL-NEXT: addiu $3, $zero, 1
+; MIPS64EL-NEXT: addu $1, $1, $2
+; MIPS64EL-NEXT: andi $1, $1, 511
 ; MIPS64EL-NEXT: jr $ra
-; MIPS64EL-NEXT: sltu $2, $2, $1
+; MIPS64EL-NEXT: sltu $2, $3, $1
 %urem = urem i9 %X, -5
 %cmp = icmp ne i9 %urem, 0
 ret i1 %cmp
@@ -142,37 +152,71 @@
 ; MIPSEL-NEXT: sw $ra, 36($sp) # 4-byte Folded Spill
 ; MIPSEL-NEXT: move $7, $6
 ; MIPSEL-NEXT: move $6, $5
-; MIPSEL-NEXT: lui $1, 18838
-; MIPSEL-NEXT: ori $1, $1, 722
-; MIPSEL-NEXT: sw $1, 28($sp)
-; MIPSEL-NEXT: sw $zero, 24($sp)
-; MIPSEL-NEXT: sw $zero, 20($sp)
+; MIPSEL-NEXT: move $5, $4
+; MIPSEL-NEXT: lui $1, 12057
+; MIPSEL-NEXT: ori $1, $1, 37186
+; MIPSEL-NEXT: lui $2, 52741
+; MIPSEL-NEXT: ori $2, $2, 40665
+; MIPSEL-NEXT: sw $2, 28($sp)
+; MIPSEL-NEXT: sw $1, 24($sp)
+; MIPSEL-NEXT: addiu $1, $zero, 2
+; MIPSEL-NEXT: sw $1, 20($sp)
 ; MIPSEL-NEXT: sw $zero, 16($sp)
-; MIPSEL-NEXT: andi $5, $4, 3
-; MIPSEL-NEXT: jal __umodti3
+; MIPSEL-NEXT: jal __multi3
 ; MIPSEL-NEXT: addiu $4, $zero, 0
-; MIPSEL-NEXT: or $1, $4, $2
-; MIPSEL-NEXT: or $2, $5, $3
+; MIPSEL-NEXT: sll $1, $4, 31
+; MIPSEL-NEXT: srl $2, $5, 1
 ; MIPSEL-NEXT: or $1, $2, $1
-; MIPSEL-NEXT: sltiu $2, $1, 1
+; MIPSEL-NEXT: lui $2, 60010
+; MIPSEL-NEXT: ori $2, $2, 61135
+; MIPSEL-NEXT: sltu $1, $1, $2
+; MIPSEL-NEXT: srl $2, $4, 1
+; MIPSEL-NEXT: andi $3, $3, 3
+; MIPSEL-NEXT: sll $4, $3, 31
+; MIPSEL-NEXT: or $4, $2, $4
+; MIPSEL-NEXT: sltiu $2, $4, 13
+; MIPSEL-NEXT: xori $4, $4, 13
+; MIPSEL-NEXT: movz $2, $1, $4
+; MIPSEL-NEXT: sll $1, $5, 1
+; MIPSEL-NEXT: srl $3, $3, 1
+; MIPSEL-NEXT: or $1, $3, $1
+; MIPSEL-NEXT: andi $1, $1, 3
+; MIPSEL-NEXT: movn $2, $zero, $1
 ; MIPSEL-NEXT: lw $ra, 36($sp) # 4-byte Folded Reload
 ; MIPSEL-NEXT: jr $ra
 ; MIPSEL-NEXT: addiu $sp, $sp, 40
 ;
 ; MIPS64EL-LABEL: test_urem_oversized:
 ; MIPS64EL: # %bb.0:
-; MIPS64EL-NEXT: daddiu $sp, $sp, -16
-; MIPS64EL-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill
-; MIPS64EL-NEXT: andi $5, $5, 3
-; MIPS64EL-NEXT: lui $1, 18838
-; MIPS64EL-NEXT: ori $6, $1, 722
-; MIPS64EL-NEXT: jal __umodti3
-; MIPS64EL-NEXT: daddiu $7, $zero, 0
-; MIPS64EL-NEXT: or $1, $2, $3
-; MIPS64EL-NEXT: sltiu $2, $1, 1
-; MIPS64EL-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload
+; MIPS64EL-NEXT: lui $1, 6029
+; MIPS64EL-NEXT: daddiu $1, $1, -14175
+; MIPS64EL-NEXT: dsll $1, $1, 16
+; MIPS64EL-NEXT: daddiu $1, $1, 26371
+; MIPS64EL-NEXT: dsll $1, $1, 17
+; MIPS64EL-NEXT: daddiu $1, $1, -24871
+; MIPS64EL-NEXT: dmult $5, $1
+; MIPS64EL-NEXT: mflo $2
+; MIPS64EL-NEXT: dmultu $4, $1
+; MIPS64EL-NEXT: mflo $1
+; MIPS64EL-NEXT: mfhi $3
+; MIPS64EL-NEXT: lui $5, 14
+; MIPS64EL-NEXT: daddiu $5, $5, -5525
+; MIPS64EL-NEXT: dsll $5, $5, 16
+; MIPS64EL-NEXT: daddiu $5, $5, -4401
+; MIPS64EL-NEXT: dsll $4, $4, 1
+; MIPS64EL-NEXT: daddu $3, $3, $4
+; MIPS64EL-NEXT: daddu $2, $3, $2
+; MIPS64EL-NEXT: andi $3, $2, 3
+; MIPS64EL-NEXT: dsll $2, $3, 63
+; MIPS64EL-NEXT: dsrl $4, $1, 1
+; MIPS64EL-NEXT: or $2, $4, $2
+; MIPS64EL-NEXT: sltu $2, $2, $5
+; MIPS64EL-NEXT: dsrl $3, $3, 1
+; MIPS64EL-NEXT: dsll $1, $1, 1
+; MIPS64EL-NEXT: or $1, $3, $1
+; MIPS64EL-NEXT: andi $1, $1, 3
 ; MIPS64EL-NEXT: jr $ra
-; MIPS64EL-NEXT: daddiu $sp, $sp, 16
+; MIPS64EL-NEXT: movn $2, $zero, $1
 %urem = urem i66 %X, 1234567890
 %cmp = icmp eq i66 %urem, 0
 ret i1 %cmp
diff --git a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-seteq-illegal-types.ll
@@ -5,36 +5,37 @@ define i1 @test_srem_odd(i29 %X) nounwind {
 ; PPC-LABEL: test_srem_odd:
 ; PPC: # %bb.0:
-; PPC-NEXT: lis 4, -23170
-; PPC-NEXT: slwi 3, 3, 3
-; PPC-NEXT: ori 4, 4, 46339
-; PPC-NEXT: srawi 3, 3, 3
-; PPC-NEXT: mulhw 4, 3, 4
-; PPC-NEXT: add 4, 4, 3
-; PPC-NEXT: srwi 5, 4, 31
-; PPC-NEXT: srawi 4, 4, 6
-; PPC-NEXT: add 4, 4, 5
-; PPC-NEXT: mulli 4, 4, 99
-; PPC-NEXT: sub 3, 3, 4
-; PPC-NEXT: cntlzw 3, 3
-; PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: lis 4, 8026
+; PPC-NEXT: ori 4, 4, 33099
+; PPC-NEXT: mullw 3, 3, 4
+; PPC-NEXT: addi 3, 3, 24493
+; PPC-NEXT: lis 4, 82
+; PPC-NEXT: addis 3, 3, 41
+; PPC-NEXT: ori 4, 4, 48987
+; PPC-NEXT: clrlwi 3, 3, 3
+; PPC-NEXT: cmplw 3, 4
+; PPC-NEXT: li 3, 0
+; PPC-NEXT: li 4, 1
+; PPC-NEXT: bc 12, 0, .LBB0_1
+; PPC-NEXT: blr
+; PPC-NEXT: .LBB0_1:
+; PPC-NEXT: addi 3, 4, 0
 ; PPC-NEXT: blr
 ;
 ; PPC64LE-LABEL: test_srem_odd:
 ; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: lis 4, -23170
-; PPC64LE-NEXT: slwi 3, 3, 3
-; PPC64LE-NEXT: srawi 3, 3, 3
-; PPC64LE-NEXT: ori 4, 4, 46339
-; PPC64LE-NEXT: mulhw 4, 3, 4
-; PPC64LE-NEXT: add 4, 4, 3
-; PPC64LE-NEXT: srwi 5, 4, 31
-; PPC64LE-NEXT: srawi 4, 4, 6
-; PPC64LE-NEXT: add 4, 4, 5
-; PPC64LE-NEXT: mulli 4, 4, 99
-; PPC64LE-NEXT: sub 3, 3, 4
-; PPC64LE-NEXT: cntlzw 3, 3
-; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC64LE-NEXT: lis 4, 8026
+; PPC64LE-NEXT: ori 4, 4, 33099
+; PPC64LE-NEXT: mullw 3, 3, 4
+; PPC64LE-NEXT: lis 4, 82
+; PPC64LE-NEXT: ori 4, 4, 48987
+; PPC64LE-NEXT: addi 3, 3, 24493
+; PPC64LE-NEXT: addis 3, 3, 41
+; PPC64LE-NEXT: clrlwi 3, 3, 3
+; PPC64LE-NEXT: cmplw 3, 4
+; PPC64LE-NEXT: li 3, 0
+; PPC64LE-NEXT: li 4, 1
+; PPC64LE-NEXT: isellt 3, 4, 3
; PPC64LE-NEXT: blr %srem = srem i29 %X, 99 %cmp = icmp eq i29 %srem, 0 diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll @@ -5,29 +5,24 @@ define i1 @test_urem_odd(i13 %X) nounwind { ; PPC-LABEL: test_urem_odd: ; PPC: # %bb.0: -; PPC-NEXT: lis 4, -13108 +; PPC-NEXT: mulli 3, 3, 3277 ; PPC-NEXT: clrlwi 3, 3, 19 -; PPC-NEXT: ori 4, 4, 52429 -; PPC-NEXT: mulhwu 4, 3, 4 -; PPC-NEXT: srwi 4, 4, 2 -; PPC-NEXT: mulli 4, 4, 5 -; PPC-NEXT: sub 3, 3, 4 -; PPC-NEXT: cntlzw 3, 3 -; PPC-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC-NEXT: li 4, 0 +; PPC-NEXT: cmplwi 3, 1639 +; PPC-NEXT: li 3, 1 +; PPC-NEXT: bclr 12, 0, 0 +; PPC-NEXT: # %bb.1: +; PPC-NEXT: ori 3, 4, 0 ; PPC-NEXT: blr ; ; PPC64LE-LABEL: test_urem_odd: ; PPC64LE: # %bb.0: -; PPC64LE-NEXT: lis 4, -13108 +; PPC64LE-NEXT: mulli 3, 3, 3277 +; PPC64LE-NEXT: li 4, 0 ; PPC64LE-NEXT: clrlwi 3, 3, 19 -; PPC64LE-NEXT: ori 4, 4, 52429 -; PPC64LE-NEXT: mulhwu 4, 3, 4 -; PPC64LE-NEXT: rlwinm 5, 4, 0, 0, 29 -; PPC64LE-NEXT: srwi 4, 4, 2 -; PPC64LE-NEXT: add 4, 4, 5 -; PPC64LE-NEXT: sub 3, 3, 4 -; PPC64LE-NEXT: cntlzw 3, 3 -; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC64LE-NEXT: cmplwi 3, 1639 +; PPC64LE-NEXT: li 3, 1 +; PPC64LE-NEXT: isellt 3, 3, 4 ; PPC64LE-NEXT: blr %urem = urem i13 %X, 5 %cmp = icmp eq i13 %urem, 0 @@ -37,30 +32,35 @@ define i1 @test_urem_even(i27 %X) nounwind { ; PPC-LABEL: test_urem_even: ; PPC: # %bb.0: -; PPC-NEXT: lis 4, -28087 -; PPC-NEXT: rlwinm 5, 3, 31, 6, 31 -; PPC-NEXT: ori 4, 4, 9363 -; PPC-NEXT: mulhwu 4, 5, 4 -; PPC-NEXT: srwi 4, 4, 2 -; PPC-NEXT: clrlwi 3, 3, 5 -; PPC-NEXT: mulli 4, 4, 14 -; PPC-NEXT: sub 3, 3, 4 -; PPC-NEXT: cntlzw 3, 3 -; PPC-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC-NEXT: lis 4, 1755 +; PPC-NEXT: ori 4, 4, 28087 +; PPC-NEXT: mullw 3, 3, 4 +; PPC-NEXT: rlwinm 4, 3, 31, 6, 31 +; PPC-NEXT: rlwimi 4, 3, 26, 5, 5 +; PPC-NEXT: lis 3, 146 +; PPC-NEXT: ori 3, 3, 18725 +; PPC-NEXT: cmplw 4, 3 +; PPC-NEXT: li 3, 0 +; PPC-NEXT: li 4, 1 +; PPC-NEXT: bc 12, 0, .LBB1_1 +; PPC-NEXT: blr +; PPC-NEXT: .LBB1_1: +; PPC-NEXT: addi 3, 4, 0 ; PPC-NEXT: blr ; ; PPC64LE-LABEL: test_urem_even: ; PPC64LE: # %bb.0: -; PPC64LE-NEXT: lis 4, -28087 +; PPC64LE-NEXT: lis 4, 1755 +; PPC64LE-NEXT: ori 4, 4, 28087 +; PPC64LE-NEXT: mullw 3, 3, 4 +; PPC64LE-NEXT: lis 4, 146 ; PPC64LE-NEXT: rlwinm 5, 3, 31, 6, 31 -; PPC64LE-NEXT: clrlwi 3, 3, 5 -; PPC64LE-NEXT: ori 4, 4, 9363 -; PPC64LE-NEXT: mulhwu 4, 5, 4 -; PPC64LE-NEXT: srwi 4, 4, 2 -; PPC64LE-NEXT: mulli 4, 4, 14 -; PPC64LE-NEXT: sub 3, 3, 4 -; PPC64LE-NEXT: cntlzw 3, 3 -; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC64LE-NEXT: rlwimi 5, 3, 26, 5, 5 +; PPC64LE-NEXT: ori 3, 4, 18725 +; PPC64LE-NEXT: li 4, 1 +; PPC64LE-NEXT: cmplw 5, 3 +; PPC64LE-NEXT: li 3, 0 +; PPC64LE-NEXT: isellt 3, 4, 3 ; PPC64LE-NEXT: blr %urem = urem i27 %X, 14 %cmp = icmp eq i27 %urem, 0 @@ -70,30 +70,26 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; PPC-LABEL: test_urem_odd_setne: ; PPC: # %bb.0: -; PPC-NEXT: lis 4, -13108 +; PPC-NEXT: mulli 3, 3, 13 ; PPC-NEXT: clrlwi 3, 3, 28 -; PPC-NEXT: ori 4, 4, 52429 -; PPC-NEXT: mulhwu 4, 3, 4 -; PPC-NEXT: srwi 4, 4, 2 -; PPC-NEXT: mulli 4, 4, 5 -; PPC-NEXT: sub 3, 3, 4 -; PPC-NEXT: cntlzw 3, 3 -; PPC-NEXT: not 3, 3 -; PPC-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC-NEXT: li 4, 0 +; PPC-NEXT: cmplwi 3, 3 +; PPC-NEXT: li 3, 1 +; PPC-NEXT: bclr 12, 1, 0 +; PPC-NEXT: # %bb.1: +; PPC-NEXT: ori 
3, 4, 0 ; PPC-NEXT: blr ; ; PPC64LE-LABEL: test_urem_odd_setne: ; PPC64LE: # %bb.0: -; PPC64LE-NEXT: lis 4, -13108 +; PPC64LE-NEXT: slwi 5, 3, 1 +; PPC64LE-NEXT: li 4, 0 +; PPC64LE-NEXT: add 3, 3, 5 +; PPC64LE-NEXT: neg 3, 3 ; PPC64LE-NEXT: clrlwi 3, 3, 28 -; PPC64LE-NEXT: ori 4, 4, 52429 -; PPC64LE-NEXT: mulhwu 4, 3, 4 -; PPC64LE-NEXT: srwi 4, 4, 2 -; PPC64LE-NEXT: rlwimi 4, 4, 2, 28, 29 -; PPC64LE-NEXT: sub 3, 3, 4 -; PPC64LE-NEXT: cntlzw 3, 3 -; PPC64LE-NEXT: not 3, 3 -; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC64LE-NEXT: cmplwi 3, 3 +; PPC64LE-NEXT: li 3, 1 +; PPC64LE-NEXT: iselgt 3, 3, 4 ; PPC64LE-NEXT: blr %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 @@ -103,30 +99,24 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { ; PPC-LABEL: test_urem_negative_odd: ; PPC: # %bb.0: -; PPC-NEXT: lis 4, 8272 +; PPC-NEXT: mulli 3, 3, 307 ; PPC-NEXT: clrlwi 3, 3, 23 -; PPC-NEXT: ori 4, 4, 51705 -; PPC-NEXT: mulhwu 4, 3, 4 -; PPC-NEXT: srwi 4, 4, 6 -; PPC-NEXT: mulli 4, 4, 507 -; PPC-NEXT: sub 3, 3, 4 -; PPC-NEXT: cntlzw 3, 3 -; PPC-NEXT: not 3, 3 -; PPC-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC-NEXT: li 4, 0 +; PPC-NEXT: cmplwi 3, 1 +; PPC-NEXT: li 3, 1 +; PPC-NEXT: bclr 12, 1, 0 +; PPC-NEXT: # %bb.1: +; PPC-NEXT: ori 3, 4, 0 ; PPC-NEXT: blr ; ; PPC64LE-LABEL: test_urem_negative_odd: ; PPC64LE: # %bb.0: -; PPC64LE-NEXT: lis 4, 8272 +; PPC64LE-NEXT: mulli 3, 3, 307 +; PPC64LE-NEXT: li 4, 0 ; PPC64LE-NEXT: clrlwi 3, 3, 23 -; PPC64LE-NEXT: ori 4, 4, 51705 -; PPC64LE-NEXT: mulhwu 4, 3, 4 -; PPC64LE-NEXT: srwi 4, 4, 6 -; PPC64LE-NEXT: mulli 4, 4, 507 -; PPC64LE-NEXT: sub 3, 3, 4 -; PPC64LE-NEXT: cntlzw 3, 3 -; PPC64LE-NEXT: not 3, 3 -; PPC64LE-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC64LE-NEXT: cmplwi 3, 1 +; PPC64LE-NEXT: li 3, 1 +; PPC64LE-NEXT: iselgt 3, 3, 4 ; PPC64LE-NEXT: blr %urem = urem i9 %X, -5 %cmp = icmp ne i9 %urem, 0 @@ -136,103 +126,79 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; PPC-LABEL: test_urem_vec: ; PPC: # %bb.0: -; PPC-NEXT: lis 6, -31983 -; PPC-NEXT: clrlwi 5, 5, 21 -; PPC-NEXT: ori 6, 6, 60211 -; PPC-NEXT: mullw 5, 5, 6 -; PPC-NEXT: lis 6, 32 -; PPC-NEXT: addi 5, 5, 10650 -; PPC-NEXT: ori 6, 6, 5132 -; PPC-NEXT: addis 5, 5, -1572 -; PPC-NEXT: cmplw 5, 6 -; PPC-NEXT: lis 6, -18725 +; PPC-NEXT: mulli 3, 3, 683 +; PPC-NEXT: rlwinm 7, 3, 31, 22, 31 +; PPC-NEXT: rlwimi 7, 3, 10, 21, 21 +; PPC-NEXT: mulli 5, 5, 819 +; PPC-NEXT: li 6, 0 +; PPC-NEXT: cmplwi 7, 341 +; PPC-NEXT: mulli 3, 4, 1463 +; PPC-NEXT: addi 4, 5, -1638 +; PPC-NEXT: addi 3, 3, -1463 ; PPC-NEXT: clrlwi 4, 4, 21 -; PPC-NEXT: ori 6, 6, 28087 -; PPC-NEXT: lis 5, -21846 -; PPC-NEXT: mullw 4, 4, 6 -; PPC-NEXT: lis 6, 9362 ; PPC-NEXT: clrlwi 3, 3, 21 -; PPC-NEXT: ori 5, 5, 43691 -; PPC-NEXT: addi 4, 4, -28087 -; PPC-NEXT: ori 6, 6, 18724 -; PPC-NEXT: mulhwu 5, 3, 5 -; PPC-NEXT: addis 4, 4, 18725 -; PPC-NEXT: cmplw 1, 4, 6 -; PPC-NEXT: srwi 4, 5, 2 -; PPC-NEXT: li 6, 0 -; PPC-NEXT: li 7, 1 -; PPC-NEXT: mulli 4, 4, 6 -; PPC-NEXT: sub 3, 3, 4 -; PPC-NEXT: cntlzw 3, 3 -; PPC-NEXT: not 3, 3 -; PPC-NEXT: bc 12, 5, .LBB4_2 +; PPC-NEXT: cmplwi 1, 4, 1 +; PPC-NEXT: cmplwi 5, 3, 292 +; PPC-NEXT: li 3, 1 +; PPC-NEXT: bc 12, 21, .LBB4_2 ; PPC-NEXT: # %bb.1: ; PPC-NEXT: ori 4, 6, 0 ; PPC-NEXT: b .LBB4_3 ; PPC-NEXT: .LBB4_2: -; PPC-NEXT: addi 4, 7, 0 +; PPC-NEXT: addi 4, 3, 0 ; PPC-NEXT: .LBB4_3: -; PPC-NEXT: bc 12, 1, .LBB4_5 +; PPC-NEXT: bc 12, 5, .LBB4_5 ; PPC-NEXT: # %bb.4: ; PPC-NEXT: ori 5, 6, 0 ; PPC-NEXT: b .LBB4_6 ; PPC-NEXT: .LBB4_5: -; PPC-NEXT: addi 5, 7, 0 +; PPC-NEXT: addi 5, 3, 0 ; PPC-NEXT: .LBB4_6: -; 
PPC-NEXT: rlwinm 3, 3, 27, 31, 31
+; PPC-NEXT: bclr 12, 1, 0
+; PPC-NEXT: # %bb.7:
+; PPC-NEXT: ori 3, 6, 0
; PPC-NEXT: blr
;
; PPC64LE-LABEL: test_urem_vec:
; PPC64LE: # %bb.0:
-; PPC64LE-NEXT: lis 6, 9362
-; PPC64LE-NEXT: lis 7, -21846
-; PPC64LE-NEXT: clrlwi 4, 4, 21
-; PPC64LE-NEXT: clrlwi 3, 3, 21
-; PPC64LE-NEXT: lis 8, 160
-; PPC64LE-NEXT: clrlwi 5, 5, 21
-; PPC64LE-NEXT: ori 6, 6, 18725
-; PPC64LE-NEXT: ori 7, 7, 43691
-; PPC64LE-NEXT: ori 8, 8, 25663
-; PPC64LE-NEXT: vspltisw 4, -11
-; PPC64LE-NEXT: mulhwu 6, 4, 6
-; PPC64LE-NEXT: mulhwu 7, 3, 7
-; PPC64LE-NEXT: mulhwu 8, 5, 8
-; PPC64LE-NEXT: sub 9, 4, 6
-; PPC64LE-NEXT: srwi 7, 7, 2
-; PPC64LE-NEXT: srwi 9, 9, 1
-; PPC64LE-NEXT: mulli 7, 7, 6
-; PPC64LE-NEXT: add 6, 9, 6
-; PPC64LE-NEXT: srwi 9, 6, 2
-; PPC64LE-NEXT: rlwinm 6, 6, 1, 0, 28
-; PPC64LE-NEXT: sub 6, 9, 6
-; PPC64LE-NEXT: sub 9, 5, 8
-; PPC64LE-NEXT: add 4, 4, 6
-; PPC64LE-NEXT: srwi 6, 9, 1
-; PPC64LE-NEXT: sub 3, 3, 7
-; PPC64LE-NEXT: add 6, 6, 8
-; PPC64LE-NEXT: mtvsrwz 34, 4
-; PPC64LE-NEXT: srwi 4, 6, 10
-; PPC64LE-NEXT: mtvsrwz 35, 3
-; PPC64LE-NEXT: mulli 3, 4, 2043
-; PPC64LE-NEXT: addis 4, 2, .LCPI4_0@toc@ha
-; PPC64LE-NEXT: vmrghw 2, 2, 3
-; PPC64LE-NEXT: addi 4, 4, .LCPI4_0@toc@l
-; PPC64LE-NEXT: lvx 3, 0, 4
-; PPC64LE-NEXT: sub 3, 5, 3
-; PPC64LE-NEXT: mtvsrwz 37, 3
+; PPC64LE-NEXT: mtvsrwz 34, 3
+; PPC64LE-NEXT: addis 3, 2, .LCPI4_0@toc@ha
+; PPC64LE-NEXT: mtvsrwz 35, 4
+; PPC64LE-NEXT: addi 3, 3, .LCPI4_0@toc@l
+; PPC64LE-NEXT: addis 4, 2, .LCPI4_2@toc@ha
+; PPC64LE-NEXT: mtvsrwz 36, 5
+; PPC64LE-NEXT: vmrghw 2, 3, 2
+; PPC64LE-NEXT: lvx 3, 0, 3
; PPC64LE-NEXT: addis 3, 2, .LCPI4_1@toc@ha
; PPC64LE-NEXT: addi 3, 3, .LCPI4_1@toc@l
-; PPC64LE-NEXT: vperm 2, 5, 2, 3
-; PPC64LE-NEXT: vsrw 3, 4, 4
+; PPC64LE-NEXT: vperm 2, 4, 2, 3
+; PPC64LE-NEXT: vspltisw 3, -11
; PPC64LE-NEXT: lvx 4, 0, 3
-; PPC64LE-NEXT: xxland 34, 34, 35
-; PPC64LE-NEXT: vcmpequw 2, 2, 4
-; PPC64LE-NEXT: xxlnor 0, 34, 34
-; PPC64LE-NEXT: xxswapd 1, 0
-; PPC64LE-NEXT: xxsldwi 2, 0, 0, 1
-; PPC64LE-NEXT: mffprwz 5, 0
-; PPC64LE-NEXT: mffprwz 3, 1
-; PPC64LE-NEXT: mffprwz 4, 2
+; PPC64LE-NEXT: addi 3, 4, .LCPI4_2@toc@l
+; PPC64LE-NEXT: addis 4, 2, .LCPI4_4@toc@ha
+; PPC64LE-NEXT: lvx 5, 0, 3
+; PPC64LE-NEXT: addis 3, 2, .LCPI4_3@toc@ha
+; PPC64LE-NEXT: addi 4, 4, .LCPI4_4@toc@l
+; PPC64LE-NEXT: addi 3, 3, .LCPI4_3@toc@l
+; PPC64LE-NEXT: vsrw 3, 3, 3
+; PPC64LE-NEXT: vsubuwm 2, 2, 4
+; PPC64LE-NEXT: lvx 4, 0, 3
+; PPC64LE-NEXT: addis 3, 2, .LCPI4_5@toc@ha
+; PPC64LE-NEXT: addi 3, 3, .LCPI4_5@toc@l
+; PPC64LE-NEXT: vmuluwm 2, 2, 5
+; PPC64LE-NEXT: lvx 5, 0, 4
+; PPC64LE-NEXT: xxland 32, 34, 35
+; PPC64LE-NEXT: vslw 2, 2, 4
+; PPC64LE-NEXT: vsrw 4, 0, 5
+; PPC64LE-NEXT: xxlor 0, 36, 34
+; PPC64LE-NEXT: lvx 2, 0, 3
+; PPC64LE-NEXT: xxland 35, 0, 35
+; PPC64LE-NEXT: vcmpgtuw 2, 3, 2
+; PPC64LE-NEXT: xxswapd 0, 34
+; PPC64LE-NEXT: xxsldwi 1, 34, 34, 1
+; PPC64LE-NEXT: mfvsrwz 5, 34
+; PPC64LE-NEXT: mffprwz 3, 0
+; PPC64LE-NEXT: mffprwz 4, 1
; PPC64LE-NEXT: blr
%urem = urem <3 x i11> %X, <i11 6, i11 7, i11 2043>
%cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
@@ -247,19 +213,35 @@
; PPC-NEXT: stwu 1, -16(1)
; PPC-NEXT: mr 6, 5
; PPC-NEXT: mr 5, 4
-; PPC-NEXT: clrlwi 4, 3, 30
-; PPC-NEXT: lis 3, 18838
-; PPC-NEXT: ori 10, 3, 722
+; PPC-NEXT: mr 4, 3
+; PPC-NEXT: lis 3, 12057
+; PPC-NEXT: lis 7, -12795
+; PPC-NEXT: ori 9, 3, 37186
+; PPC-NEXT: ori 10, 7, 40665
; PPC-NEXT: li 3, 0
; PPC-NEXT: li 7, 0
-; PPC-NEXT: li 8, 0
-; PPC-NEXT: li 9, 0
-; PPC-NEXT: bl __umodti3
-; PPC-NEXT: or 3, 5, 3
-; PPC-NEXT: or 4, 6, 4
-; PPC-NEXT: or 3, 4, 3
-; 
PPC-NEXT: cntlzw 3, 3 -; PPC-NEXT: rlwinm 3, 3, 27, 31, 31 +; PPC-NEXT: li 8, 2 +; PPC-NEXT: bl __multi3 +; PPC-NEXT: rotlwi 7, 6, 31 +; PPC-NEXT: lis 3, -5526 +; PPC-NEXT: rlwimi 7, 5, 31, 0, 0 +; PPC-NEXT: rotlwi 5, 5, 31 +; PPC-NEXT: rlwimi 5, 4, 31, 0, 0 +; PPC-NEXT: ori 3, 3, 61135 +; PPC-NEXT: cmplwi 1, 5, 13 +; PPC-NEXT: cmplw 7, 3 +; PPC-NEXT: rlwinm 4, 4, 31, 31, 31 +; PPC-NEXT: crand 20, 6, 0 +; PPC-NEXT: crandc 21, 4, 6 +; PPC-NEXT: rlwimi. 4, 6, 1, 30, 30 +; PPC-NEXT: cror 20, 20, 21 +; PPC-NEXT: crnand 20, 2, 20 +; PPC-NEXT: li 3, 1 +; PPC-NEXT: bc 12, 20, .LBB5_1 +; PPC-NEXT: b .LBB5_2 +; PPC-NEXT: .LBB5_1: +; PPC-NEXT: li 3, 0 +; PPC-NEXT: .LBB5_2: ; PPC-NEXT: lwz 0, 20(1) ; PPC-NEXT: addi 1, 1, 16 ; PPC-NEXT: mtlr 0 @@ -267,21 +249,28 @@ ; ; PPC64LE-LABEL: test_urem_oversized: ; PPC64LE: # %bb.0: -; PPC64LE-NEXT: mflr 0 -; PPC64LE-NEXT: std 0, 16(1) -; PPC64LE-NEXT: stdu 1, -32(1) -; PPC64LE-NEXT: lis 5, 18838 -; PPC64LE-NEXT: clrldi 4, 4, 62 -; PPC64LE-NEXT: li 6, 0 -; PPC64LE-NEXT: ori 5, 5, 722 -; PPC64LE-NEXT: bl __umodti3 -; PPC64LE-NEXT: nop -; PPC64LE-NEXT: or 3, 3, 4 -; PPC64LE-NEXT: cntlzd 3, 3 -; PPC64LE-NEXT: rldicl 3, 3, 58, 63 -; PPC64LE-NEXT: addi 1, 1, 32 -; PPC64LE-NEXT: ld 0, 16(1) -; PPC64LE-NEXT: mtlr 0 +; PPC64LE-NEXT: lis 5, 6028 +; PPC64LE-NEXT: ori 5, 5, 51361 +; PPC64LE-NEXT: rldic 5, 5, 33, 2 +; PPC64LE-NEXT: oris 5, 5, 52741 +; PPC64LE-NEXT: ori 5, 5, 40665 +; PPC64LE-NEXT: mulhdu 6, 3, 5 +; PPC64LE-NEXT: mulld 4, 4, 5 +; PPC64LE-NEXT: mulld 5, 3, 5 +; PPC64LE-NEXT: sldi 3, 3, 1 +; PPC64LE-NEXT: add 3, 6, 3 +; PPC64LE-NEXT: add 3, 3, 4 +; PPC64LE-NEXT: lis 4, -8538 +; PPC64LE-NEXT: rotldi 6, 5, 63 +; PPC64LE-NEXT: ori 4, 4, 44780 +; PPC64LE-NEXT: rldimi 6, 3, 63, 0 +; PPC64LE-NEXT: rlwinm 3, 3, 31, 31, 31 +; PPC64LE-NEXT: rldicl 4, 4, 4, 28 +; PPC64LE-NEXT: rlwimi. 3, 5, 1, 30, 30 +; PPC64LE-NEXT: cmpld 1, 6, 4 +; PPC64LE-NEXT: li 3, 1 +; PPC64LE-NEXT: crnand 20, 2, 4 +; PPC64LE-NEXT: isel 3, 0, 3, 20 ; PPC64LE-NEXT: blr %urem = urem i66 %X, 1234567890 %cmp = icmp eq i66 %urem, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll --- a/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmulh-sdnode-rv32.ll @@ -2,22 +2,23 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s ; Test that the prepareSREMEqFold optimization doesn't crash on scalable -; vector types. RVV doesn't have ROTR or ROTL operations so the optimization -; itself doesn't kick in. +; vector types. 
define <vscale x 4 x i1> @srem_eq_fold_nxv4i8(<vscale x 4 x i8> %va) {
; CHECK-LABEL: srem_eq_fold_nxv4i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: addi a0, zero, 43
+; CHECK-NEXT: addi a0, zero, -85
; CHECK-NEXT: vsetvli a1, zero, e8,mf2,ta,mu
-; CHECK-NEXT: vmulh.vx v25, v8, a0
-; CHECK-NEXT: vadd.vi v25, v25, 0
-; CHECK-NEXT: vsrl.vi v26, v25, 7
-; CHECK-NEXT: vand.vi v26, v26, -1
-; CHECK-NEXT: vadd.vv v25, v25, v26
-; CHECK-NEXT: addi a0, zero, 6
-; CHECK-NEXT: vmul.vx v25, v25, a0
-; CHECK-NEXT: vsub.vv v25, v8, v25
-; CHECK-NEXT: vmseq.vi v0, v25, 0
+; CHECK-NEXT: vmul.vx v25, v8, a0
+; CHECK-NEXT: addi a0, zero, 42
+; CHECK-NEXT: vadd.vx v25, v25, a0
+; CHECK-NEXT: vmv.v.i v26, 1
+; CHECK-NEXT: vrsub.vi v27, v26, 0
+; CHECK-NEXT: vand.vi v27, v27, 7
+; CHECK-NEXT: vsll.vv v27, v25, v27
+; CHECK-NEXT: vand.vi v26, v26, 7
+; CHECK-NEXT: vsrl.vv v25, v25, v26
+; CHECK-NEXT: vor.vv v25, v25, v27
+; CHECK-NEXT: vmsleu.vx v0, v25, a0
; CHECK-NEXT: ret
%head_six = insertelement <vscale x 4 x i8> undef, i8 6, i32 0
%splat_six = shufflevector <vscale x 4 x i8> %head_six, <vscale x 4 x i8> undef, <vscale x 4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll
@@ -11,11 +11,18 @@
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: srai a0, a0, 3
-; RV32-NEXT: addi a1, zero, 99
-; RV32-NEXT: call __modsi3@plt
-; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: lui a1, 128424
+; RV32-NEXT: addi a1, a1, 331
+; RV32-NEXT: call __mulsi3@plt
+; RV32-NEXT: lui a1, 662
+; RV32-NEXT: addi a1, a1, -83
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: lui a1, 131072
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: lui a1, 1324
+; RV32-NEXT: addi a1, a1, -165
+; RV32-NEXT: sltu a0, a0, a1
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
@@ -24,100 +31,83 @@
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; RV64-NEXT: slli a0, a0, 35
-; RV64-NEXT: srai a0, a0, 35
-; RV64-NEXT: addi a1, zero, 99
-; RV64-NEXT: call __moddi3@plt
-; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: lui a1, 128424
+; RV64-NEXT: addiw a1, a1, 331
+; RV64-NEXT: call __muldi3@plt
+; RV64-NEXT: lui a1, 662
+; RV64-NEXT: addiw a1, a1, -83
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: addiw a1, a1, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: lui a1, 1324
+; RV64-NEXT: addiw a1, a1, -165
+; RV64-NEXT: sltu a0, a0, a1
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
;
; RV32M-LABEL: test_srem_odd:
; RV32M: # %bb.0:
-; RV32M-NEXT: slli a0, a0, 3
-; RV32M-NEXT: srai a0, a0, 3
-; RV32M-NEXT: lui a1, 783784
+; RV32M-NEXT: lui a1, 128424
; RV32M-NEXT: addi a1, a1, 331
; RV32M-NEXT: mul a0, a0, a1
-; RV32M-NEXT: lui a1, 5296
-; RV32M-NEXT: addi a1, a1, -662
+; RV32M-NEXT: lui a1, 662
+; RV32M-NEXT: addi a1, a1, -83
; RV32M-NEXT: add a0, a0, a1
-; RV32M-NEXT: lui a1, 10592
-; RV32M-NEXT: addi a1, a1, -1323
+; RV32M-NEXT: lui a1, 131072
+; RV32M-NEXT: addi a1, a1, -1
+; RV32M-NEXT: and a0, a0, a1
+; RV32M-NEXT: lui a1, 1324
+; RV32M-NEXT: addi a1, a1, -165
; RV32M-NEXT: sltu a0, a0, a1
; RV32M-NEXT: ret
;
; RV64M-LABEL: test_srem_odd:
; RV64M: # %bb.0:
-; RV64M-NEXT: slli a0, a0, 35
-; RV64M-NEXT: srai a0, a0, 35
-; RV64M-NEXT: lui a1, 1048536
-; RV64M-NEXT: addiw a1, a1, -331
-; RV64M-NEXT: slli a1, a1, 15
-; 
RV64M-NEXT: addi a1, a1, 331 -; RV64M-NEXT: slli a1, a1, 15 -; RV64M-NEXT: addi a1, a1, -331 -; RV64M-NEXT: slli a1, a1, 15 -; RV64M-NEXT: addi a1, a1, 331 +; RV64M-NEXT: lui a1, 128424 +; RV64M-NEXT: addiw a1, a1, 331 ; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: lui a1, 331 -; RV64M-NEXT: addiw a1, a1, -41 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -1531 -; RV64M-NEXT: slli a2, a1, 12 -; RV64M-NEXT: addi a2, a2, 703 -; RV64M-NEXT: slli a2, a2, 12 -; RV64M-NEXT: addi a2, a2, 1448 -; RV64M-NEXT: add a0, a0, a2 -; RV64M-NEXT: slli a1, a1, 13 -; RV64M-NEXT: addi a1, a1, 1407 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -1199 +; RV64M-NEXT: lui a1, 662 +; RV64M-NEXT: addiw a1, a1, -83 +; RV64M-NEXT: add a0, a0, a1 +; RV64M-NEXT: lui a1, 131072 +; RV64M-NEXT: addiw a1, a1, -1 +; RV64M-NEXT: and a0, a0, a1 +; RV64M-NEXT: lui a1, 1324 +; RV64M-NEXT: addiw a1, a1, -165 ; RV64M-NEXT: sltu a0, a0, a1 ; RV64M-NEXT: ret ; ; RV32MV-LABEL: test_srem_odd: ; RV32MV: # %bb.0: -; RV32MV-NEXT: slli a0, a0, 3 -; RV32MV-NEXT: srai a0, a0, 3 -; RV32MV-NEXT: lui a1, 783784 +; RV32MV-NEXT: lui a1, 128424 ; RV32MV-NEXT: addi a1, a1, 331 ; RV32MV-NEXT: mul a0, a0, a1 -; RV32MV-NEXT: lui a1, 5296 -; RV32MV-NEXT: addi a1, a1, -662 +; RV32MV-NEXT: lui a1, 662 +; RV32MV-NEXT: addi a1, a1, -83 ; RV32MV-NEXT: add a0, a0, a1 -; RV32MV-NEXT: lui a1, 10592 -; RV32MV-NEXT: addi a1, a1, -1323 +; RV32MV-NEXT: lui a1, 131072 +; RV32MV-NEXT: addi a1, a1, -1 +; RV32MV-NEXT: and a0, a0, a1 +; RV32MV-NEXT: lui a1, 1324 +; RV32MV-NEXT: addi a1, a1, -165 ; RV32MV-NEXT: sltu a0, a0, a1 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_srem_odd: ; RV64MV: # %bb.0: -; RV64MV-NEXT: slli a0, a0, 35 -; RV64MV-NEXT: srai a0, a0, 35 -; RV64MV-NEXT: lui a1, 1048536 -; RV64MV-NEXT: addiw a1, a1, -331 -; RV64MV-NEXT: slli a1, a1, 15 -; RV64MV-NEXT: addi a1, a1, 331 -; RV64MV-NEXT: slli a1, a1, 15 -; RV64MV-NEXT: addi a1, a1, -331 -; RV64MV-NEXT: slli a1, a1, 15 -; RV64MV-NEXT: addi a1, a1, 331 +; RV64MV-NEXT: lui a1, 128424 +; RV64MV-NEXT: addiw a1, a1, 331 ; RV64MV-NEXT: mul a0, a0, a1 -; RV64MV-NEXT: lui a1, 331 -; RV64MV-NEXT: addiw a1, a1, -41 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -1531 -; RV64MV-NEXT: slli a2, a1, 12 -; RV64MV-NEXT: addi a2, a2, 703 -; RV64MV-NEXT: slli a2, a2, 12 -; RV64MV-NEXT: addi a2, a2, 1448 -; RV64MV-NEXT: add a0, a0, a2 -; RV64MV-NEXT: slli a1, a1, 13 -; RV64MV-NEXT: addi a1, a1, 1407 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -1199 +; RV64MV-NEXT: lui a1, 662 +; RV64MV-NEXT: addiw a1, a1, -83 +; RV64MV-NEXT: add a0, a0, a1 +; RV64MV-NEXT: lui a1, 131072 +; RV64MV-NEXT: addiw a1, a1, -1 +; RV64MV-NEXT: and a0, a0, a1 +; RV64MV-NEXT: lui a1, 1324 +; RV64MV-NEXT: addiw a1, a1, -165 ; RV64MV-NEXT: sltu a0, a0, a1 ; RV64MV-NEXT: ret %srem = srem i29 %X, 99 @@ -423,46 +413,65 @@ ; RV64-NEXT: and a0, a0, a1 ; RV64-NEXT: ld a1, 0(s0) ; RV64-NEXT: slli a2, a0, 29 -; RV64-NEXT: srai s2, a2, 31 +; RV64-NEXT: srai s1, a2, 31 ; RV64-NEXT: slli a0, a0, 31 ; RV64-NEXT: srli a2, a1, 33 ; RV64-NEXT: or a0, a2, a0 ; RV64-NEXT: slli a0, a0, 31 -; RV64-NEXT: srai s1, a0, 31 -; RV64-NEXT: slli a0, a1, 31 ; RV64-NEXT: srai a0, a0, 31 -; RV64-NEXT: addi a1, zero, 6 -; RV64-NEXT: call __moddi3@plt -; RV64-NEXT: mv s3, a0 +; RV64-NEXT: slli a1, a1, 31 +; RV64-NEXT: srai s2, a1, 31 ; RV64-NEXT: addi a1, zero, 7 ; RV64-NEXT: addi s5, zero, 7 +; RV64-NEXT: call __moddi3@plt +; RV64-NEXT: mv s3, a0 +; RV64-NEXT: addi a1, zero, -5 ; RV64-NEXT: mv a0, s1 ; RV64-NEXT: call 
__moddi3@plt ; RV64-NEXT: mv s1, a0 -; RV64-NEXT: addi a1, zero, -5 +; RV64-NEXT: lui a0, 1026731 +; RV64-NEXT: addiw a0, a0, -1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -1365 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a1, a0, -1365 ; RV64-NEXT: mv a0, s2 -; RV64-NEXT: call __moddi3@plt -; RV64-NEXT: addi a0, a0, -2 -; RV64-NEXT: snez a0, a0 -; RV64-NEXT: addi a1, s1, -1 +; RV64-NEXT: call __muldi3@plt +; RV64-NEXT: lui a1, 10923 +; RV64-NEXT: addiw a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1366 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: slli a2, a0, 63 +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: or a0, a0, a2 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: addi a1, s1, -2 ; RV64-NEXT: snez a1, a1 -; RV64-NEXT: snez a2, s3 +; RV64-NEXT: addi a2, s3, -1 +; RV64-NEXT: snez a2, a2 +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: neg a2, a2 -; RV64-NEXT: neg a1, a1 -; RV64-NEXT: neg a3, a0 +; RV64-NEXT: neg a3, a1 ; RV64-NEXT: slli a4, s5, 32 ; RV64-NEXT: and a3, a3, a4 ; RV64-NEXT: srli a3, a3, 32 ; RV64-NEXT: sb a3, 12(s0) -; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: slli a1, a1, 2 ; RV64-NEXT: slli a3, s4, 33 ; RV64-NEXT: addi a3, a3, -1 -; RV64-NEXT: and a1, a1, a3 -; RV64-NEXT: srli a4, a1, 31 -; RV64-NEXT: sub a0, a4, a0 -; RV64-NEXT: sw a0, 8(s0) -; RV64-NEXT: and a0, a2, a3 -; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: and a2, a2, a3 +; RV64-NEXT: srli a4, a2, 31 +; RV64-NEXT: sub a1, a4, a1 +; RV64-NEXT: sw a1, 8(s0) +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: slli a1, a2, 33 ; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: sd a0, 0(s0) ; RV64-NEXT: ld s5, 8(sp) # 8-byte Folded Reload @@ -609,7 +618,11 @@ ; RV64M-NEXT: slli a5, a2, 2 ; RV64M-NEXT: add a2, a5, a2 ; RV64M-NEXT: add a2, a4, a2 -; RV64M-NEXT: lui a4, 10923 +; RV64M-NEXT: addi a2, a2, -2 +; RV64M-NEXT: snez a2, a2 +; RV64M-NEXT: addi a1, a1, -1 +; RV64M-NEXT: snez a1, a1 +; RV64M-NEXT: lui a4, 1026731 ; RV64M-NEXT: addiw a4, a4, -1365 ; RV64M-NEXT: slli a4, a4, 12 ; RV64M-NEXT: addi a4, a4, -1365 @@ -617,17 +630,20 @@ ; RV64M-NEXT: addi a4, a4, -1365 ; RV64M-NEXT: slli a4, a4, 12 ; RV64M-NEXT: addi a4, a4, -1365 -; RV64M-NEXT: mulh a4, a3, a4 -; RV64M-NEXT: srli a5, a4, 63 -; RV64M-NEXT: add a4, a4, a5 -; RV64M-NEXT: addi a5, zero, 6 -; RV64M-NEXT: mul a4, a4, a5 -; RV64M-NEXT: sub a3, a3, a4 -; RV64M-NEXT: addi a2, a2, -2 -; RV64M-NEXT: snez a2, a2 -; RV64M-NEXT: addi a1, a1, -1 -; RV64M-NEXT: snez a1, a1 -; RV64M-NEXT: snez a3, a3 +; RV64M-NEXT: mul a3, a3, a4 +; RV64M-NEXT: lui a4, 10923 +; RV64M-NEXT: addiw a4, a4, -1365 +; RV64M-NEXT: slli a4, a4, 12 +; RV64M-NEXT: addi a4, a4, -1365 +; RV64M-NEXT: slli a4, a4, 12 +; RV64M-NEXT: addi a4, a4, -1365 +; RV64M-NEXT: slli a4, a4, 12 +; RV64M-NEXT: addi a4, a4, -1366 +; RV64M-NEXT: add a3, a3, a4 +; RV64M-NEXT: slli a5, a3, 63 +; RV64M-NEXT: srli a3, a3, 1 +; RV64M-NEXT: or a3, a3, a5 +; RV64M-NEXT: sltu a3, a4, a3 ; RV64M-NEXT: neg a1, a1 ; RV64M-NEXT: neg a4, a2 ; RV64M-NEXT: neg a3, a3 diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -11,12 +11,13 @@ ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: lui a1, 1 +; 
RV32-NEXT: addi a1, a1, -819 +; RV32-NEXT: call __mulsi3@plt ; RV32-NEXT: lui a1, 2 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: addi a1, zero, 5 -; RV32-NEXT: call __umodsi3@plt -; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: sltiu a0, a0, 1639 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -25,90 +26,59 @@ ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: lui a1, 1 +; RV64-NEXT: addiw a1, a1, -819 +; RV64-NEXT: call __muldi3@plt ; RV64-NEXT: lui a1, 2 ; RV64-NEXT: addiw a1, a1, -1 ; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: addi a1, zero, 5 -; RV64-NEXT: call __umoddi3@plt -; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: sltiu a0, a0, 1639 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_urem_odd: ; RV32M: # %bb.0: +; RV32M-NEXT: lui a1, 1 +; RV32M-NEXT: addi a1, a1, -819 +; RV32M-NEXT: mul a0, a0, a1 ; RV32M-NEXT: lui a1, 2 ; RV32M-NEXT: addi a1, a1, -1 ; RV32M-NEXT: and a0, a0, a1 -; RV32M-NEXT: lui a1, 838861 -; RV32M-NEXT: addi a1, a1, -819 -; RV32M-NEXT: mul a0, a0, a1 -; RV32M-NEXT: lui a1, 209715 -; RV32M-NEXT: addi a1, a1, 820 -; RV32M-NEXT: sltu a0, a0, a1 +; RV32M-NEXT: sltiu a0, a0, 1639 ; RV32M-NEXT: ret ; ; RV64M-LABEL: test_urem_odd: ; RV64M: # %bb.0: +; RV64M-NEXT: lui a1, 1 +; RV64M-NEXT: addiw a1, a1, -819 +; RV64M-NEXT: mul a0, a0, a1 ; RV64M-NEXT: lui a1, 2 ; RV64M-NEXT: addiw a1, a1, -1 ; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: lui a1, 1035469 -; RV64M-NEXT: addiw a1, a1, -819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -819 -; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: lui a1, 13107 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, 819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, 819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, 820 -; RV64M-NEXT: sltu a0, a0, a1 +; RV64M-NEXT: sltiu a0, a0, 1639 ; RV64M-NEXT: ret ; ; RV32MV-LABEL: test_urem_odd: ; RV32MV: # %bb.0: +; RV32MV-NEXT: lui a1, 1 +; RV32MV-NEXT: addi a1, a1, -819 +; RV32MV-NEXT: mul a0, a0, a1 ; RV32MV-NEXT: lui a1, 2 ; RV32MV-NEXT: addi a1, a1, -1 ; RV32MV-NEXT: and a0, a0, a1 -; RV32MV-NEXT: lui a1, 838861 -; RV32MV-NEXT: addi a1, a1, -819 -; RV32MV-NEXT: mul a0, a0, a1 -; RV32MV-NEXT: lui a1, 209715 -; RV32MV-NEXT: addi a1, a1, 820 -; RV32MV-NEXT: sltu a0, a0, a1 +; RV32MV-NEXT: sltiu a0, a0, 1639 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_urem_odd: ; RV64MV: # %bb.0: +; RV64MV-NEXT: lui a1, 1 +; RV64MV-NEXT: addiw a1, a1, -819 +; RV64MV-NEXT: mul a0, a0, a1 ; RV64MV-NEXT: lui a1, 2 ; RV64MV-NEXT: addiw a1, a1, -1 ; RV64MV-NEXT: and a0, a0, a1 -; RV64MV-NEXT: lui a1, 1035469 -; RV64MV-NEXT: addiw a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -819 -; RV64MV-NEXT: mul a0, a0, a1 -; RV64MV-NEXT: lui a1, 13107 -; RV64MV-NEXT: addiw a1, a1, 819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, 819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, 819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, 820 -; RV64MV-NEXT: sltu a0, a0, a1 +; RV64MV-NEXT: sltiu a0, a0, 1639 ; RV64MV-NEXT: ret %urem = urem i13 %X, 5 %cmp = icmp eq i13 
%urem, 0 @@ -120,12 +90,20 @@ ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: lui a1, 32768 -; RV32-NEXT: addi a1, a1, -1 +; RV32-NEXT: lui a1, 28087 +; RV32-NEXT: addi a1, a1, -585 +; RV32-NEXT: call __mulsi3@plt +; RV32-NEXT: slli a1, a0, 26 +; RV32-NEXT: lui a2, 32768 +; RV32-NEXT: addi a3, a2, -2 +; RV32-NEXT: and a0, a0, a3 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: or a0, a0, a1 +; RV32-NEXT: addi a1, a2, -1 ; RV32-NEXT: and a0, a0, a1 -; RV32-NEXT: addi a1, zero, 14 -; RV32-NEXT: call __umodsi3@plt -; RV32-NEXT: seqz a0, a0 +; RV32-NEXT: lui a1, 2341 +; RV32-NEXT: addi a1, a1, -1755 +; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -134,90 +112,94 @@ ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: lui a1, 32768 -; RV64-NEXT: addiw a1, a1, -1 +; RV64-NEXT: lui a1, 28087 +; RV64-NEXT: addiw a1, a1, -585 +; RV64-NEXT: call __muldi3@plt +; RV64-NEXT: slli a1, a0, 26 +; RV64-NEXT: lui a2, 32768 +; RV64-NEXT: addiw a3, a2, -2 +; RV64-NEXT: and a0, a0, a3 +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: or a0, a0, a1 +; RV64-NEXT: addiw a1, a2, -1 ; RV64-NEXT: and a0, a0, a1 -; RV64-NEXT: addi a1, zero, 14 -; RV64-NEXT: call __umoddi3@plt -; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: lui a1, 2341 +; RV64-NEXT: addiw a1, a1, -1755 +; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_urem_even: ; RV32M: # %bb.0: -; RV32M-NEXT: lui a1, 32768 -; RV32M-NEXT: addi a1, a1, -1 +; RV32M-NEXT: lui a1, 28087 +; RV32M-NEXT: addi a1, a1, -585 +; RV32M-NEXT: mul a0, a0, a1 +; RV32M-NEXT: slli a1, a0, 26 +; RV32M-NEXT: lui a2, 32768 +; RV32M-NEXT: addi a3, a2, -2 +; RV32M-NEXT: and a0, a0, a3 +; RV32M-NEXT: srli a0, a0, 1 +; RV32M-NEXT: or a0, a0, a1 +; RV32M-NEXT: addi a1, a2, -1 ; RV32M-NEXT: and a0, a0, a1 -; RV32M-NEXT: srli a1, a0, 1 -; RV32M-NEXT: lui a2, 599186 -; RV32M-NEXT: addi a2, a2, 1171 -; RV32M-NEXT: mulhu a1, a1, a2 -; RV32M-NEXT: srli a1, a1, 2 -; RV32M-NEXT: addi a2, zero, 14 -; RV32M-NEXT: mul a1, a1, a2 -; RV32M-NEXT: sub a0, a0, a1 -; RV32M-NEXT: seqz a0, a0 +; RV32M-NEXT: lui a1, 2341 +; RV32M-NEXT: addi a1, a1, -1755 +; RV32M-NEXT: sltu a0, a0, a1 ; RV32M-NEXT: ret ; ; RV64M-LABEL: test_urem_even: ; RV64M: # %bb.0: -; RV64M-NEXT: lui a1, 32768 -; RV64M-NEXT: addiw a1, a1, -1 +; RV64M-NEXT: lui a1, 28087 +; RV64M-NEXT: addiw a1, a1, -585 +; RV64M-NEXT: mul a0, a0, a1 +; RV64M-NEXT: slli a1, a0, 26 +; RV64M-NEXT: lui a2, 32768 +; RV64M-NEXT: addiw a3, a2, -2 +; RV64M-NEXT: and a0, a0, a3 +; RV64M-NEXT: srli a0, a0, 1 +; RV64M-NEXT: or a0, a0, a1 +; RV64M-NEXT: addiw a1, a2, -1 ; RV64M-NEXT: and a0, a0, a1 -; RV64M-NEXT: srli a1, a0, 1 -; RV64M-NEXT: lui a2, 18725 -; RV64M-NEXT: addiw a2, a2, -1755 -; RV64M-NEXT: slli a2, a2, 12 -; RV64M-NEXT: addi a2, a2, -1755 -; RV64M-NEXT: slli a2, a2, 12 -; RV64M-NEXT: addi a2, a2, -1755 -; RV64M-NEXT: slli a2, a2, 12 -; RV64M-NEXT: addi a2, a2, -1755 -; RV64M-NEXT: mulhu a1, a1, a2 -; RV64M-NEXT: srli a1, a1, 1 -; RV64M-NEXT: addi a2, zero, 14 -; RV64M-NEXT: mul a1, a1, a2 -; RV64M-NEXT: sub a0, a0, a1 -; RV64M-NEXT: seqz a0, a0 +; RV64M-NEXT: lui a1, 2341 +; RV64M-NEXT: addiw a1, a1, -1755 +; RV64M-NEXT: sltu a0, a0, a1 ; RV64M-NEXT: ret ; ; RV32MV-LABEL: test_urem_even: ; RV32MV: # %bb.0: -; RV32MV-NEXT: lui a1, 32768 -; RV32MV-NEXT: addi a1, a1, -1 +; RV32MV-NEXT: 
lui a1, 28087 +; RV32MV-NEXT: addi a1, a1, -585 +; RV32MV-NEXT: mul a0, a0, a1 +; RV32MV-NEXT: slli a1, a0, 26 +; RV32MV-NEXT: lui a2, 32768 +; RV32MV-NEXT: addi a3, a2, -2 +; RV32MV-NEXT: and a0, a0, a3 +; RV32MV-NEXT: srli a0, a0, 1 +; RV32MV-NEXT: or a0, a0, a1 +; RV32MV-NEXT: addi a1, a2, -1 ; RV32MV-NEXT: and a0, a0, a1 -; RV32MV-NEXT: srli a1, a0, 1 -; RV32MV-NEXT: lui a2, 599186 -; RV32MV-NEXT: addi a2, a2, 1171 -; RV32MV-NEXT: mulhu a1, a1, a2 -; RV32MV-NEXT: srli a1, a1, 2 -; RV32MV-NEXT: addi a2, zero, 14 -; RV32MV-NEXT: mul a1, a1, a2 -; RV32MV-NEXT: sub a0, a0, a1 -; RV32MV-NEXT: seqz a0, a0 +; RV32MV-NEXT: lui a1, 2341 +; RV32MV-NEXT: addi a1, a1, -1755 +; RV32MV-NEXT: sltu a0, a0, a1 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_urem_even: ; RV64MV: # %bb.0: -; RV64MV-NEXT: lui a1, 32768 -; RV64MV-NEXT: addiw a1, a1, -1 +; RV64MV-NEXT: lui a1, 28087 +; RV64MV-NEXT: addiw a1, a1, -585 +; RV64MV-NEXT: mul a0, a0, a1 +; RV64MV-NEXT: slli a1, a0, 26 +; RV64MV-NEXT: lui a2, 32768 +; RV64MV-NEXT: addiw a3, a2, -2 +; RV64MV-NEXT: and a0, a0, a3 +; RV64MV-NEXT: srli a0, a0, 1 +; RV64MV-NEXT: or a0, a0, a1 +; RV64MV-NEXT: addiw a1, a2, -1 ; RV64MV-NEXT: and a0, a0, a1 -; RV64MV-NEXT: srli a1, a0, 1 -; RV64MV-NEXT: lui a2, 18725 -; RV64MV-NEXT: addiw a2, a2, -1755 -; RV64MV-NEXT: slli a2, a2, 12 -; RV64MV-NEXT: addi a2, a2, -1755 -; RV64MV-NEXT: slli a2, a2, 12 -; RV64MV-NEXT: addi a2, a2, -1755 -; RV64MV-NEXT: slli a2, a2, 12 -; RV64MV-NEXT: addi a2, a2, -1755 -; RV64MV-NEXT: mulhu a1, a1, a2 -; RV64MV-NEXT: srli a1, a1, 1 -; RV64MV-NEXT: addi a2, zero, 14 -; RV64MV-NEXT: mul a1, a1, a2 -; RV64MV-NEXT: sub a0, a0, a1 -; RV64MV-NEXT: seqz a0, a0 +; RV64MV-NEXT: lui a1, 2341 +; RV64MV-NEXT: addiw a1, a1, -1755 +; RV64MV-NEXT: sltu a0, a0, a1 ; RV64MV-NEXT: ret %urem = urem i27 %X, 14 %cmp = icmp eq i27 %urem, 0 @@ -227,93 +209,61 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; RV32-LABEL: test_urem_odd_setne: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: slli a1, a0, 1 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: neg a0, a0 ; RV32-NEXT: andi a0, a0, 15 -; RV32-NEXT: addi a1, zero, 5 -; RV32-NEXT: call __umodsi3@plt -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: addi a1, zero, 3 +; RV32-NEXT: sltu a0, a1, a0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_odd_setne: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: slli a1, a0, 1 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: andi a0, a0, 15 -; RV64-NEXT: addi a1, zero, 5 -; RV64-NEXT: call __umoddi3@plt -; RV64-NEXT: snez a0, a0 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: addi a1, zero, 3 +; RV64-NEXT: sltu a0, a1, a0 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_urem_odd_setne: ; RV32M: # %bb.0: +; RV32M-NEXT: slli a1, a0, 1 +; RV32M-NEXT: add a0, a1, a0 +; RV32M-NEXT: neg a0, a0 ; RV32M-NEXT: andi a0, a0, 15 -; RV32M-NEXT: lui a1, 838861 -; RV32M-NEXT: addi a1, a1, -819 -; RV32M-NEXT: mul a0, a0, a1 -; RV32M-NEXT: lui a1, 209715 -; RV32M-NEXT: addi a1, a1, 819 +; RV32M-NEXT: addi a1, zero, 3 ; RV32M-NEXT: sltu a0, a1, a0 ; RV32M-NEXT: ret ; ; RV64M-LABEL: test_urem_odd_setne: ; RV64M: # %bb.0: +; RV64M-NEXT: slli a1, a0, 1 +; RV64M-NEXT: add a0, a1, a0 +; RV64M-NEXT: neg a0, a0 ; RV64M-NEXT: andi a0, a0, 15 -; RV64M-NEXT: lui a1, 1035469 -; RV64M-NEXT: addiw a1, a1, -819 -; 
RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -819 -; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: lui a1, 13107 -; RV64M-NEXT: addiw a1, a1, 819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, 819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, 819 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, 819 +; RV64M-NEXT: addi a1, zero, 3 ; RV64M-NEXT: sltu a0, a1, a0 ; RV64M-NEXT: ret ; ; RV32MV-LABEL: test_urem_odd_setne: ; RV32MV: # %bb.0: +; RV32MV-NEXT: slli a1, a0, 1 +; RV32MV-NEXT: add a0, a1, a0 +; RV32MV-NEXT: neg a0, a0 ; RV32MV-NEXT: andi a0, a0, 15 -; RV32MV-NEXT: lui a1, 838861 -; RV32MV-NEXT: addi a1, a1, -819 -; RV32MV-NEXT: mul a0, a0, a1 -; RV32MV-NEXT: lui a1, 209715 -; RV32MV-NEXT: addi a1, a1, 819 +; RV32MV-NEXT: addi a1, zero, 3 ; RV32MV-NEXT: sltu a0, a1, a0 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_urem_odd_setne: ; RV64MV: # %bb.0: +; RV64MV-NEXT: slli a1, a0, 1 +; RV64MV-NEXT: add a0, a1, a0 +; RV64MV-NEXT: neg a0, a0 ; RV64MV-NEXT: andi a0, a0, 15 -; RV64MV-NEXT: lui a1, 1035469 -; RV64MV-NEXT: addiw a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -819 -; RV64MV-NEXT: mul a0, a0, a1 -; RV64MV-NEXT: lui a1, 13107 -; RV64MV-NEXT: addiw a1, a1, 819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, 819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, 819 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, 819 +; RV64MV-NEXT: addi a1, zero, 3 ; RV64MV-NEXT: sltu a0, a1, a0 ; RV64MV-NEXT: ret %urem = urem i4 %X, 5 @@ -326,10 +276,11 @@ ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: addi a1, zero, 307 +; RV32-NEXT: call __mulsi3@plt ; RV32-NEXT: andi a0, a0, 511 -; RV32-NEXT: addi a1, zero, 507 -; RV32-NEXT: call __umodsi3@plt -; RV32-NEXT: snez a0, a0 +; RV32-NEXT: addi a1, zero, 1 +; RV32-NEXT: sltu a0, a1, a0 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -338,75 +289,48 @@ ; RV64: # %bb.0: ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: addi a1, zero, 307 +; RV64-NEXT: call __muldi3@plt ; RV64-NEXT: andi a0, a0, 511 -; RV64-NEXT: addi a1, zero, 507 -; RV64-NEXT: call __umoddi3@plt -; RV64-NEXT: snez a0, a0 +; RV64-NEXT: addi a1, zero, 1 +; RV64-NEXT: sltu a0, a1, a0 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret ; ; RV32M-LABEL: test_urem_negative_odd: ; RV32M: # %bb.0: -; RV32M-NEXT: andi a0, a0, 511 -; RV32M-NEXT: lui a1, 692846 -; RV32M-NEXT: addi a1, a1, 307 +; RV32M-NEXT: addi a1, zero, 307 ; RV32M-NEXT: mul a0, a0, a1 -; RV32M-NEXT: lui a1, 2068 -; RV32M-NEXT: addi a1, a1, 807 +; RV32M-NEXT: andi a0, a0, 511 +; RV32M-NEXT: addi a1, zero, 1 ; RV32M-NEXT: sltu a0, a1, a0 ; RV32M-NEXT: ret ; ; RV64M-LABEL: test_urem_negative_odd: ; RV64M: # %bb.0: -; RV64M-NEXT: andi a0, a0, 511 -; RV64M-NEXT: lui a1, 1042824 -; RV64M-NEXT: addiw a1, a1, -711 -; RV64M-NEXT: slli a1, a1, 13 -; RV64M-NEXT: addi a1, a1, 469 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -1737 -; RV64M-NEXT: slli a1, a1, 13 -; RV64M-NEXT: addi a1, a1, 307 +; RV64M-NEXT: addi a1, zero, 307 ; RV64M-NEXT: mul a0, a0, a1 -; RV64M-NEXT: lui a1, 
132365 -; RV64M-NEXT: addiw a1, a1, -1543 -; RV64M-NEXT: slli a1, a1, 14 -; RV64M-NEXT: addi a1, a1, -1131 -; RV64M-NEXT: slli a1, a1, 12 -; RV64M-NEXT: addi a1, a1, -186 +; RV64M-NEXT: andi a0, a0, 511 +; RV64M-NEXT: addi a1, zero, 1 ; RV64M-NEXT: sltu a0, a1, a0 ; RV64M-NEXT: ret ; ; RV32MV-LABEL: test_urem_negative_odd: ; RV32MV: # %bb.0: -; RV32MV-NEXT: andi a0, a0, 511 -; RV32MV-NEXT: lui a1, 692846 -; RV32MV-NEXT: addi a1, a1, 307 +; RV32MV-NEXT: addi a1, zero, 307 ; RV32MV-NEXT: mul a0, a0, a1 -; RV32MV-NEXT: lui a1, 2068 -; RV32MV-NEXT: addi a1, a1, 807 +; RV32MV-NEXT: andi a0, a0, 511 +; RV32MV-NEXT: addi a1, zero, 1 ; RV32MV-NEXT: sltu a0, a1, a0 ; RV32MV-NEXT: ret ; ; RV64MV-LABEL: test_urem_negative_odd: ; RV64MV: # %bb.0: -; RV64MV-NEXT: andi a0, a0, 511 -; RV64MV-NEXT: lui a1, 1042824 -; RV64MV-NEXT: addiw a1, a1, -711 -; RV64MV-NEXT: slli a1, a1, 13 -; RV64MV-NEXT: addi a1, a1, 469 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -1737 -; RV64MV-NEXT: slli a1, a1, 13 -; RV64MV-NEXT: addi a1, a1, 307 +; RV64MV-NEXT: addi a1, zero, 307 ; RV64MV-NEXT: mul a0, a0, a1 -; RV64MV-NEXT: lui a1, 132365 -; RV64MV-NEXT: addiw a1, a1, -1543 -; RV64MV-NEXT: slli a1, a1, 14 -; RV64MV-NEXT: addi a1, a1, -1131 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -186 +; RV64MV-NEXT: andi a0, a0, 511 +; RV64MV-NEXT: addi a1, zero, 1 ; RV64MV-NEXT: sltu a0, a1, a0 ; RV64MV-NEXT: ret %urem = urem i9 %X, -5 @@ -428,38 +352,44 @@ ; RV32-NEXT: lw a1, 0(s0) ; RV32-NEXT: slli a0, a0, 10 ; RV32-NEXT: srli a2, a1, 22 -; RV32-NEXT: or a0, a2, a0 -; RV32-NEXT: andi s2, a0, 2047 -; RV32-NEXT: andi s1, a1, 2047 -; RV32-NEXT: srli a0, a1, 11 +; RV32-NEXT: or s1, a2, a0 +; RV32-NEXT: srli s2, a1, 11 +; RV32-NEXT: andi a0, a1, 2047 +; RV32-NEXT: addi a1, zero, 683 +; RV32-NEXT: call __mulsi3@plt +; RV32-NEXT: slli a1, a0, 10 +; RV32-NEXT: andi a0, a0, 2046 +; RV32-NEXT: srli a0, a0, 1 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: andi a0, a0, 2047 -; RV32-NEXT: addi a1, zero, 7 -; RV32-NEXT: call __umodsi3@plt -; RV32-NEXT: mv s3, a0 -; RV32-NEXT: addi a1, zero, 6 +; RV32-NEXT: addi a1, zero, 341 +; RV32-NEXT: sltu s3, a1, a0 +; RV32-NEXT: addi a1, zero, 819 ; RV32-NEXT: mv a0, s1 -; RV32-NEXT: call __umodsi3@plt -; RV32-NEXT: mv s1, a0 -; RV32-NEXT: addi a1, zero, 2043 +; RV32-NEXT: call __mulsi3@plt +; RV32-NEXT: addi a0, a0, -1638 +; RV32-NEXT: andi a0, a0, 2047 +; RV32-NEXT: addi a1, zero, 1 +; RV32-NEXT: sltu s1, a1, a0 +; RV32-NEXT: addi a1, zero, 1463 ; RV32-NEXT: mv a0, s2 -; RV32-NEXT: call __umodsi3@plt -; RV32-NEXT: addi a0, a0, -2 -; RV32-NEXT: snez a0, a0 -; RV32-NEXT: snez a1, s1 -; RV32-NEXT: addi a2, s3, -1 -; RV32-NEXT: snez a2, a2 -; RV32-NEXT: neg a2, a2 -; RV32-NEXT: neg a1, a1 -; RV32-NEXT: neg a3, a0 -; RV32-NEXT: srli a3, a3, 10 -; RV32-NEXT: andi a3, a3, 1 -; RV32-NEXT: sb a3, 4(s0) +; RV32-NEXT: call __mulsi3@plt +; RV32-NEXT: addi a0, a0, -1463 +; RV32-NEXT: andi a0, a0, 2047 +; RV32-NEXT: addi a1, zero, 292 +; RV32-NEXT: sltu a0, a1, a0 +; RV32-NEXT: neg a1, s3 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: neg a2, s1 +; RV32-NEXT: srli a2, a2, 10 +; RV32-NEXT: andi a2, a2, 1 +; RV32-NEXT: sb a2, 4(s0) ; RV32-NEXT: andi a1, a1, 2047 -; RV32-NEXT: andi a2, a2, 2047 -; RV32-NEXT: slli a2, a2, 11 -; RV32-NEXT: or a1, a1, a2 -; RV32-NEXT: slli a0, a0, 22 -; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: andi a0, a0, 2047 +; RV32-NEXT: slli a0, a0, 11 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: slli a1, s1, 22 +; RV32-NEXT: sub a0, a0, a1 ; RV32-NEXT: sw a0, 0(s0) ; RV32-NEXT: lw s3, 
12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s2, 16(sp) # 4-byte Folded Reload @@ -477,45 +407,53 @@ ; RV64-NEXT: sd s1, 24(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; RV64-NEXT: sd s3, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s4, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: mv s0, a0 ; RV64-NEXT: lbu a0, 4(a0) ; RV64-NEXT: lwu a1, 0(s0) ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: srli s2, a0, 22 -; RV64-NEXT: andi s1, a0, 2047 -; RV64-NEXT: srli a0, a0, 11 +; RV64-NEXT: srli s2, a0, 11 +; RV64-NEXT: srli s1, a0, 22 +; RV64-NEXT: andi a0, a0, 2047 +; RV64-NEXT: addi a1, zero, 683 +; RV64-NEXT: call __muldi3@plt +; RV64-NEXT: slli a1, a0, 10 +; RV64-NEXT: andi a0, a0, 2046 +; RV64-NEXT: srli a0, a0, 1 +; RV64-NEXT: or a0, a0, a1 ; RV64-NEXT: andi a0, a0, 2047 -; RV64-NEXT: addi a1, zero, 7 -; RV64-NEXT: call __umoddi3@plt -; RV64-NEXT: mv s3, a0 -; RV64-NEXT: addi a1, zero, 6 +; RV64-NEXT: addi a1, zero, 341 +; RV64-NEXT: sltu s3, a1, a0 +; RV64-NEXT: addi a1, zero, 819 ; RV64-NEXT: mv a0, s1 -; RV64-NEXT: call __umoddi3@plt -; RV64-NEXT: mv s1, a0 -; RV64-NEXT: addi a1, zero, 2043 +; RV64-NEXT: call __muldi3@plt +; RV64-NEXT: addi a0, a0, -1638 +; RV64-NEXT: andi a0, a0, 2047 +; RV64-NEXT: addi s4, zero, 1 +; RV64-NEXT: sltu s1, s4, a0 +; RV64-NEXT: addi a1, zero, 1463 ; RV64-NEXT: mv a0, s2 -; RV64-NEXT: call __umoddi3@plt -; RV64-NEXT: addi a0, a0, -2 -; RV64-NEXT: snez a0, a0 -; RV64-NEXT: snez a1, s1 -; RV64-NEXT: addi a2, s3, -1 -; RV64-NEXT: snez a2, a2 -; RV64-NEXT: neg a2, a2 -; RV64-NEXT: neg a1, a1 +; RV64-NEXT: call __muldi3@plt +; RV64-NEXT: addi a0, a0, -1463 +; RV64-NEXT: andi a0, a0, 2047 +; RV64-NEXT: addi a1, zero, 292 +; RV64-NEXT: sltu a0, a1, a0 +; RV64-NEXT: neg a1, s3 +; RV64-NEXT: neg a0, a0 ; RV64-NEXT: andi a1, a1, 2047 -; RV64-NEXT: andi a2, a2, 2047 -; RV64-NEXT: slli a2, a2, 11 -; RV64-NEXT: or a1, a1, a2 -; RV64-NEXT: slli a0, a0, 22 -; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: andi a0, a0, 2047 +; RV64-NEXT: slli a0, a0, 11 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: slli a1, s1, 22 +; RV64-NEXT: sub a0, a0, a1 ; RV64-NEXT: sw a0, 0(s0) -; RV64-NEXT: addi a1, zero, 1 -; RV64-NEXT: slli a1, a1, 33 +; RV64-NEXT: slli a1, s4, 33 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a0, a0, a1 ; RV64-NEXT: srli a0, a0, 32 ; RV64-NEXT: sb a0, 4(s0) +; RV64-NEXT: ld s4, 0(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s3, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s2, 16(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s1, 24(sp) # 8-byte Folded Reload @@ -531,45 +469,38 @@ ; RV32M-NEXT: slli a1, a1, 10 ; RV32M-NEXT: srli a3, a2, 22 ; RV32M-NEXT: or a1, a3, a1 -; RV32M-NEXT: andi a1, a1, 2047 ; RV32M-NEXT: srli a3, a2, 11 -; RV32M-NEXT: andi a3, a3, 2047 ; RV32M-NEXT: andi a2, a2, 2047 -; RV32M-NEXT: lui a4, 699051 -; RV32M-NEXT: addi a4, a4, -1365 -; RV32M-NEXT: mulhu a4, a2, a4 -; RV32M-NEXT: srli a4, a4, 2 -; RV32M-NEXT: addi a5, zero, 6 -; RV32M-NEXT: mul a4, a4, a5 -; RV32M-NEXT: sub a2, a2, a4 -; RV32M-NEXT: lui a4, 536863 -; RV32M-NEXT: addi a4, a4, -1229 +; RV32M-NEXT: addi a4, zero, 683 +; RV32M-NEXT: mul a2, a2, a4 +; RV32M-NEXT: slli a4, a2, 10 +; RV32M-NEXT: andi a2, a2, 2046 +; RV32M-NEXT: srli a2, a2, 1 +; RV32M-NEXT: or a2, a2, a4 +; RV32M-NEXT: andi a2, a2, 2047 +; RV32M-NEXT: addi a4, zero, 341 +; RV32M-NEXT: sltu a2, a4, a2 +; RV32M-NEXT: addi a4, zero, 819 ; RV32M-NEXT: mul a1, a1, a4 -; RV32M-NEXT: lui a4, 1023427 -; RV32M-NEXT: addi a4, a4, -1638 -; RV32M-NEXT: add a1, a1, a4 -; RV32M-NEXT: lui a4, 513 -; 
RV32M-NEXT: addi a4, a4, 1036 +; RV32M-NEXT: addi a1, a1, -1638 +; RV32M-NEXT: andi a1, a1, 2047 +; RV32M-NEXT: addi a4, zero, 1 ; RV32M-NEXT: sltu a1, a4, a1 -; RV32M-NEXT: lui a4, 748983 -; RV32M-NEXT: addi a4, a4, -585 +; RV32M-NEXT: addi a4, zero, 1463 ; RV32M-NEXT: mul a3, a3, a4 -; RV32M-NEXT: lui a4, 299593 -; RV32M-NEXT: addi a4, a4, 585 -; RV32M-NEXT: add a3, a3, a4 -; RV32M-NEXT: lui a4, 149797 -; RV32M-NEXT: addi a4, a4, -1756 +; RV32M-NEXT: addi a3, a3, -1463 +; RV32M-NEXT: andi a3, a3, 2047 +; RV32M-NEXT: addi a4, zero, 292 ; RV32M-NEXT: sltu a3, a4, a3 -; RV32M-NEXT: snez a2, a2 ; RV32M-NEXT: neg a2, a2 ; RV32M-NEXT: neg a3, a3 ; RV32M-NEXT: neg a4, a1 ; RV32M-NEXT: srli a4, a4, 10 ; RV32M-NEXT: andi a4, a4, 1 ; RV32M-NEXT: sb a4, 4(a0) +; RV32M-NEXT: andi a2, a2, 2047 ; RV32M-NEXT: andi a3, a3, 2047 ; RV32M-NEXT: slli a3, a3, 11 -; RV32M-NEXT: andi a2, a2, 2047 ; RV32M-NEXT: or a2, a2, a3 ; RV32M-NEXT: slli a1, a1, 22 ; RV32M-NEXT: sub a1, a2, a1 @@ -583,75 +514,29 @@ ; RV64M-NEXT: slli a1, a1, 32 ; RV64M-NEXT: or a1, a2, a1 ; RV64M-NEXT: srli a2, a1, 11 -; RV64M-NEXT: andi a2, a2, 2047 ; RV64M-NEXT: srli a3, a1, 22 ; RV64M-NEXT: andi a1, a1, 2047 -; RV64M-NEXT: lui a4, 1026731 -; RV64M-NEXT: addiw a4, a4, -1365 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, -1365 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, -1365 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, -1365 -; RV64M-NEXT: mulhu a4, a1, a4 -; RV64M-NEXT: srli a4, a4, 2 -; RV64M-NEXT: addi a5, zero, 6 -; RV64M-NEXT: mul a4, a4, a5 -; RV64M-NEXT: sub a1, a1, a4 -; RV64M-NEXT: snez a1, a1 -; RV64M-NEXT: lui a4, 14948 -; RV64M-NEXT: addiw a4, a4, 2029 -; RV64M-NEXT: slli a4, a4, 13 -; RV64M-NEXT: addi a4, a4, -381 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, 287 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, -1229 +; RV64M-NEXT: addi a4, zero, 683 +; RV64M-NEXT: mul a1, a1, a4 +; RV64M-NEXT: slli a4, a1, 10 +; RV64M-NEXT: andi a1, a1, 2046 +; RV64M-NEXT: srli a1, a1, 1 +; RV64M-NEXT: or a1, a1, a4 +; RV64M-NEXT: andi a1, a1, 2047 +; RV64M-NEXT: addi a4, zero, 341 +; RV64M-NEXT: sltu a1, a4, a1 +; RV64M-NEXT: addi a4, zero, 819 ; RV64M-NEXT: mul a3, a3, a4 -; RV64M-NEXT: lui a4, 1436 -; RV64M-NEXT: addiw a4, a4, -2029 -; RV64M-NEXT: slli a4, a4, 13 -; RV64M-NEXT: addi a4, a4, 381 -; RV64M-NEXT: slli a4, a4, 13 -; RV64M-NEXT: addi a4, a4, -573 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, -1638 -; RV64M-NEXT: add a3, a3, a4 -; RV64M-NEXT: lui a4, 16424 -; RV64M-NEXT: addiw a4, a4, 401 -; RV64M-NEXT: slli a4, a4, 14 -; RV64M-NEXT: addi a4, a4, -345 -; RV64M-NEXT: slli a4, a4, 13 -; RV64M-NEXT: addi a4, a4, 1295 +; RV64M-NEXT: addi a3, a3, -1638 +; RV64M-NEXT: andi a3, a3, 2047 +; RV64M-NEXT: addi a4, zero, 1 ; RV64M-NEXT: sltu a3, a4, a3 -; RV64M-NEXT: lui a4, 28087 -; RV64M-NEXT: addiw a4, a4, -585 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, -585 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, -585 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, -585 -; RV64M-NEXT: mul a2, a2, a4 -; RV64M-NEXT: lui a4, 1020489 -; RV64M-NEXT: addiw a4, a4, 585 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, 585 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, 585 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, 585 -; RV64M-NEXT: add a2, a2, a4 -; RV64M-NEXT: lui a4, 4681 -; RV64M-NEXT: addiw a4, a4, 585 -; RV64M-NEXT: slli a4, a4, 12 -; RV64M-NEXT: addi a4, a4, 585 -; RV64M-NEXT: slli a4, a4, 12 
-; RV64M-NEXT: addi a4, a4, 585 -; RV64M-NEXT: slli a4, a4, 13 -; RV64M-NEXT: addi a4, a4, 1170 -; RV64M-NEXT: sltu a2, a4, a2 +; RV64M-NEXT: addi a5, zero, 1463 +; RV64M-NEXT: mul a2, a2, a5 +; RV64M-NEXT: addi a2, a2, -1463 +; RV64M-NEXT: andi a2, a2, 2047 +; RV64M-NEXT: addi a5, zero, 292 +; RV64M-NEXT: sltu a2, a5, a2 ; RV64M-NEXT: neg a1, a1 ; RV64M-NEXT: neg a2, a2 ; RV64M-NEXT: andi a1, a1, 2047 @@ -661,8 +546,7 @@ ; RV64M-NEXT: slli a2, a3, 22 ; RV64M-NEXT: sub a1, a1, a2 ; RV64M-NEXT: sw a1, 0(a0) -; RV64M-NEXT: addi a2, zero, 1 -; RV64M-NEXT: slli a2, a2, 33 +; RV64M-NEXT: slli a2, a4, 33 ; RV64M-NEXT: addi a2, a2, -1 ; RV64M-NEXT: and a1, a1, a2 ; RV64M-NEXT: srli a1, a1, 32 @@ -672,56 +556,50 @@ ; RV32MV-LABEL: test_urem_vec: ; RV32MV: # %bb.0: ; RV32MV-NEXT: addi sp, sp, -16 -; RV32MV-NEXT: lb a1, 4(a0) -; RV32MV-NEXT: lw a2, 0(a0) -; RV32MV-NEXT: slli a1, a1, 10 -; RV32MV-NEXT: srli a3, a2, 22 -; RV32MV-NEXT: or a1, a3, a1 -; RV32MV-NEXT: andi a1, a1, 2047 -; RV32MV-NEXT: srli a3, a2, 11 -; RV32MV-NEXT: andi a3, a3, 2047 -; RV32MV-NEXT: andi a2, a2, 2047 -; RV32MV-NEXT: lui a4, 699051 -; RV32MV-NEXT: addi a4, a4, -1365 -; RV32MV-NEXT: mulhu a4, a2, a4 -; RV32MV-NEXT: srli a4, a4, 2 -; RV32MV-NEXT: addi a5, zero, 6 -; RV32MV-NEXT: mul a4, a4, a5 -; RV32MV-NEXT: sub a2, a2, a4 +; RV32MV-NEXT: lw a1, 0(a0) +; RV32MV-NEXT: andi a2, a1, 2047 ; RV32MV-NEXT: sh a2, 8(sp) -; RV32MV-NEXT: lui a2, 2566 -; RV32MV-NEXT: addi a2, a2, 1087 -; RV32MV-NEXT: mulhu a2, a1, a2 -; RV32MV-NEXT: sub a4, a1, a2 -; RV32MV-NEXT: srli a4, a4, 1 -; RV32MV-NEXT: add a2, a4, a2 -; RV32MV-NEXT: srli a2, a2, 10 -; RV32MV-NEXT: addi a4, zero, 2043 -; RV32MV-NEXT: mul a2, a2, a4 -; RV32MV-NEXT: sub a1, a1, a2 +; RV32MV-NEXT: srli a2, a1, 11 +; RV32MV-NEXT: andi a2, a2, 2047 +; RV32MV-NEXT: sh a2, 10(sp) +; RV32MV-NEXT: lb a2, 4(a0) +; RV32MV-NEXT: slli a2, a2, 10 +; RV32MV-NEXT: srli a1, a1, 22 +; RV32MV-NEXT: or a1, a1, a2 +; RV32MV-NEXT: andi a1, a1, 2047 ; RV32MV-NEXT: sh a1, 12(sp) -; RV32MV-NEXT: lui a1, 149797 -; RV32MV-NEXT: addi a1, a1, -1755 -; RV32MV-NEXT: mulhu a1, a3, a1 -; RV32MV-NEXT: sub a2, a3, a1 -; RV32MV-NEXT: srli a2, a2, 1 -; RV32MV-NEXT: add a1, a2, a1 -; RV32MV-NEXT: srli a1, a1, 2 -; RV32MV-NEXT: slli a2, a1, 3 -; RV32MV-NEXT: sub a1, a1, a2 -; RV32MV-NEXT: add a1, a3, a1 -; RV32MV-NEXT: sh a1, 10(sp) ; RV32MV-NEXT: vsetivli a1, 4, e16,m1,ta,mu ; RV32MV-NEXT: addi a1, sp, 8 ; RV32MV-NEXT: vle16.v v25, (a1) ; RV32MV-NEXT: lui a1, %hi(.LCPI4_0) ; RV32MV-NEXT: addi a1, a1, %lo(.LCPI4_0) ; RV32MV-NEXT: vle16.v v26, (a1) +; RV32MV-NEXT: vid.v v27 +; RV32MV-NEXT: vsub.vv v25, v25, v27 +; RV32MV-NEXT: vmul.vv v25, v25, v26 +; RV32MV-NEXT: vsll.vi v26, v25, 1 +; RV32MV-NEXT: vmv.v.i v27, 10 +; RV32MV-NEXT: addi a1, zero, 9 +; RV32MV-NEXT: vsetvli a2, zero, e16,m1,ta,mu +; RV32MV-NEXT: vmv.s.x v27, a1 +; RV32MV-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; RV32MV-NEXT: vsll.vv v26, v26, v27 ; RV32MV-NEXT: addi a1, zero, 2047 ; RV32MV-NEXT: vand.vx v25, v25, a1 -; RV32MV-NEXT: vmsne.vv v0, v25, v26 -; RV32MV-NEXT: vmv.v.i v25, 0 -; RV32MV-NEXT: vmerge.vim v25, v25, -1, v0 +; RV32MV-NEXT: vmv.v.i v27, 0 +; RV32MV-NEXT: addi a2, zero, 1 +; RV32MV-NEXT: vsetvli a3, zero, e16,m1,ta,mu +; RV32MV-NEXT: vmv1r.v v28, v27 +; RV32MV-NEXT: vmv.s.x v28, a2 +; RV32MV-NEXT: vsetivli a2, 4, e16,m1,ta,mu +; RV32MV-NEXT: lui a2, %hi(.LCPI4_1) +; RV32MV-NEXT: addi a2, a2, %lo(.LCPI4_1) +; RV32MV-NEXT: vle16.v v29, (a2) +; RV32MV-NEXT: vsrl.vv v25, v25, v28 +; RV32MV-NEXT: vor.vv v25, v25, v26 +; RV32MV-NEXT: vand.vx 
v25, v25, a1 +; RV32MV-NEXT: vmsltu.vv v0, v29, v25 +; RV32MV-NEXT: vmerge.vim v25, v27, -1, v0 ; RV32MV-NEXT: vsetivli a1, 1, e16,m1,ta,mu ; RV32MV-NEXT: vslidedown.vi v26, v25, 2 ; RV32MV-NEXT: vmv.x.s a1, v26 @@ -748,54 +626,12 @@ ; RV64MV-NEXT: lwu a2, 0(a0) ; RV64MV-NEXT: slli a1, a1, 32 ; RV64MV-NEXT: or a1, a2, a1 -; RV64MV-NEXT: srli a2, a1, 11 -; RV64MV-NEXT: andi a2, a2, 2047 -; RV64MV-NEXT: andi a3, a1, 2047 -; RV64MV-NEXT: srli a1, a1, 22 -; RV64MV-NEXT: lui a4, 1027 -; RV64MV-NEXT: addiw a4, a4, -2023 -; RV64MV-NEXT: slli a4, a4, 15 -; RV64MV-NEXT: addi a4, a4, 2005 -; RV64MV-NEXT: slli a4, a4, 12 -; RV64MV-NEXT: addi a4, a4, -431 -; RV64MV-NEXT: slli a4, a4, 13 -; RV64MV-NEXT: addi a4, a4, -429 -; RV64MV-NEXT: mulhu a4, a1, a4 -; RV64MV-NEXT: srli a4, a4, 9 -; RV64MV-NEXT: addi a5, zero, 2043 -; RV64MV-NEXT: mul a4, a4, a5 -; RV64MV-NEXT: sub a1, a1, a4 -; RV64MV-NEXT: sh a1, 12(sp) -; RV64MV-NEXT: lui a1, 1026731 -; RV64MV-NEXT: addiw a1, a1, -1365 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -1365 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -1365 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, -1365 -; RV64MV-NEXT: mulhu a1, a3, a1 -; RV64MV-NEXT: srli a1, a1, 2 -; RV64MV-NEXT: addi a4, zero, 6 -; RV64MV-NEXT: mul a1, a1, a4 -; RV64MV-NEXT: sub a1, a3, a1 -; RV64MV-NEXT: sh a1, 8(sp) -; RV64MV-NEXT: lui a1, 4681 -; RV64MV-NEXT: addiw a1, a1, 585 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, 585 -; RV64MV-NEXT: slli a1, a1, 12 -; RV64MV-NEXT: addi a1, a1, 585 -; RV64MV-NEXT: slli a1, a1, 13 -; RV64MV-NEXT: addi a1, a1, 1171 -; RV64MV-NEXT: mulhu a1, a2, a1 -; RV64MV-NEXT: sub a3, a2, a1 -; RV64MV-NEXT: srli a3, a3, 1 -; RV64MV-NEXT: add a1, a3, a1 -; RV64MV-NEXT: srli a1, a1, 2 -; RV64MV-NEXT: slli a3, a1, 3 -; RV64MV-NEXT: sub a1, a1, a3 -; RV64MV-NEXT: add a1, a2, a1 +; RV64MV-NEXT: srli a2, a1, 22 +; RV64MV-NEXT: sh a2, 12(sp) +; RV64MV-NEXT: andi a2, a1, 2047 +; RV64MV-NEXT: sh a2, 8(sp) +; RV64MV-NEXT: srli a1, a1, 11 +; RV64MV-NEXT: andi a1, a1, 2047 ; RV64MV-NEXT: sh a1, 10(sp) ; RV64MV-NEXT: vsetivli a1, 4, e16,m1,ta,mu ; RV64MV-NEXT: addi a1, sp, 8 @@ -803,14 +639,34 @@ ; RV64MV-NEXT: lui a1, %hi(.LCPI4_0) ; RV64MV-NEXT: addi a1, a1, %lo(.LCPI4_0) ; RV64MV-NEXT: vle16.v v26, (a1) +; RV64MV-NEXT: vid.v v27 +; RV64MV-NEXT: vsub.vv v25, v25, v27 +; RV64MV-NEXT: vmul.vv v25, v25, v26 +; RV64MV-NEXT: vsll.vi v26, v25, 1 +; RV64MV-NEXT: vmv.v.i v27, 10 +; RV64MV-NEXT: addi a1, zero, 9 +; RV64MV-NEXT: vsetvli a2, zero, e16,m1,ta,mu +; RV64MV-NEXT: vmv.s.x v27, a1 +; RV64MV-NEXT: vsetivli a1, 4, e16,m1,ta,mu +; RV64MV-NEXT: vsll.vv v26, v26, v27 ; RV64MV-NEXT: addi a1, zero, 2047 ; RV64MV-NEXT: vand.vx v25, v25, a1 -; RV64MV-NEXT: vmsne.vv v0, v25, v26 -; RV64MV-NEXT: vmv.v.i v25, 0 -; RV64MV-NEXT: vmerge.vim v25, v25, -1, v0 +; RV64MV-NEXT: vmv.v.i v27, 0 +; RV64MV-NEXT: addi a2, zero, 1 +; RV64MV-NEXT: vsetvli a3, zero, e16,m1,ta,mu +; RV64MV-NEXT: vmv1r.v v28, v27 +; RV64MV-NEXT: vmv.s.x v28, a2 +; RV64MV-NEXT: vsetivli a3, 4, e16,m1,ta,mu +; RV64MV-NEXT: lui a3, %hi(.LCPI4_1) +; RV64MV-NEXT: addi a3, a3, %lo(.LCPI4_1) +; RV64MV-NEXT: vle16.v v29, (a3) +; RV64MV-NEXT: vsrl.vv v25, v25, v28 +; RV64MV-NEXT: vor.vv v25, v25, v26 +; RV64MV-NEXT: vand.vx v25, v25, a1 +; RV64MV-NEXT: vmsltu.vv v0, v29, v25 +; RV64MV-NEXT: vmerge.vim v25, v27, -1, v0 ; RV64MV-NEXT: vmv.x.s a1, v25 ; RV64MV-NEXT: andi a1, a1, 2047 -; RV64MV-NEXT: addi a2, zero, 1 ; RV64MV-NEXT: vsetivli a3, 1, e16,m1,ta,mu ; RV64MV-NEXT: vslidedown.vi 
v26, v25, 1 ; RV64MV-NEXT: vmv.x.s a3, v26 diff --git a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb/srem-seteq-illegal-types.ll @@ -4,8 +4,6 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; CHECK-LABEL: test_srem_odd: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsls r0, r0, #3 -; CHECK-NEXT: asrs r0, r0, #3 ; CHECK-NEXT: ldr r1, .LCPI0_0 ; CHECK-NEXT: muls r1, r0, r1 ; CHECK-NEXT: ldr r0, .LCPI0_1 @@ -22,11 +20,11 @@ ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 3210379595 @ 0xbf5a814b +; CHECK-NEXT: .long 4208200280 @ 0xfad40a58 ; CHECK-NEXT: .LCPI0_1: -; CHECK-NEXT: .long 21691754 @ 0x14afd6a +; CHECK-NEXT: .long 21691752 @ 0x14afd68 ; CHECK-NEXT: .LCPI0_2: -; CHECK-NEXT: .long 43383509 @ 0x295fad5 +; CHECK-NEXT: .long 43383512 @ 0x295fad8 %srem = srem i29 %X, 99 %cmp = icmp eq i29 %srem, 0 ret i1 %cmp diff --git a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/Thumb/urem-seteq-illegal-types.ll @@ -5,11 +5,9 @@ ; CHECK-LABEL: test_urem_odd: ; CHECK: @ %bb.0: ; CHECK-NEXT: ldr r1, .LCPI0_0 -; CHECK-NEXT: ands r1, r0 +; CHECK-NEXT: muls r1, r0, r1 ; CHECK-NEXT: ldr r0, .LCPI0_1 -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: ldr r1, .LCPI0_2 -; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: cmp r1, r0 ; CHECK-NEXT: blo .LBB0_2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: movs r0, #0 @@ -20,11 +18,9 @@ ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 8191 @ 0x1fff +; CHECK-NEXT: .long 1718091776 @ 0x66680000 ; CHECK-NEXT: .LCPI0_1: -; CHECK-NEXT: .long 3435973837 @ 0xcccccccd -; CHECK-NEXT: .LCPI0_2: -; CHECK-NEXT: .long 858993460 @ 0x33333334 +; CHECK-NEXT: .long 859308032 @ 0x33380000 %urem = urem i13 %X, 5 %cmp = icmp eq i13 %urem, 0 ret i1 %cmp @@ -33,26 +29,31 @@ define i1 @test_urem_even(i27 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: @ %bb.0: -; CHECK-NEXT: movs r1, #31 -; CHECK-NEXT: lsls r1, r1, #27 -; CHECK-NEXT: bics r0, r1 ; CHECK-NEXT: ldr r1, .LCPI1_0 ; CHECK-NEXT: muls r1, r0, r1 -; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: rors r1, r0 +; CHECK-NEXT: lsls r0, r1, #26 ; CHECK-NEXT: ldr r2, .LCPI1_1 -; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: ands r2, r1 +; CHECK-NEXT: lsrs r1, r2, #1 +; CHECK-NEXT: adds r0, r1, r0 +; CHECK-NEXT: lsls r0, r0, #5 +; CHECK-NEXT: ldr r1, .LCPI1_2 +; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo .LBB1_2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI1_0: -; CHECK-NEXT: .long 3067833783 @ 0xb6db6db7 +; CHECK-NEXT: .long 115043767 @ 0x6db6db7 ; CHECK-NEXT: .LCPI1_1: -; CHECK-NEXT: .long 306783379 @ 0x12492493 +; CHECK-NEXT: .long 134217726 @ 0x7fffffe +; CHECK-NEXT: .LCPI1_2: +; CHECK-NEXT: .long 306783392 @ 0x124924a0 %urem = urem i27 %X, 14 %cmp = icmp eq i27 %urem, 0 ret i1 %cmp @@ -61,12 +62,11 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; CHECK-LABEL: test_urem_odd_setne: ; CHECK: @ %bb.0: -; CHECK-NEXT: movs r1, #15 -; CHECK-NEXT: ands r1, r0 -; CHECK-NEXT: ldr r0, .LCPI2_0 -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: ldr r1, .LCPI2_1 -; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: movs r1, #13 +; CHECK-NEXT: muls r1, r0, r1 +; CHECK-NEXT: 
movs r0, #15 +; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: cmp r0, #3 ; CHECK-NEXT: bhi .LBB2_2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: movs r0, #0 @@ -74,12 +74,6 @@ ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: bx lr -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.3: -; CHECK-NEXT: .LCPI2_0: -; CHECK-NEXT: .long 3435973837 @ 0xcccccccd -; CHECK-NEXT: .LCPI2_1: -; CHECK-NEXT: .long 858993459 @ 0x33333333 %urem = urem i4 %X, 5 %cmp = icmp ne i4 %urem, 0 ret i1 %cmp @@ -88,12 +82,12 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { ; CHECK-LABEL: test_urem_negative_odd: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldr r1, .LCPI3_0 -; CHECK-NEXT: ands r1, r0 -; CHECK-NEXT: ldr r0, .LCPI3_1 -; CHECK-NEXT: muls r0, r1, r0 -; CHECK-NEXT: ldr r1, .LCPI3_2 -; CHECK-NEXT: cmp r0, r1 +; CHECK-NEXT: movs r1, #255 +; CHECK-NEXT: adds r1, #52 +; CHECK-NEXT: muls r1, r0, r1 +; CHECK-NEXT: ldr r0, .LCPI3_0 +; CHECK-NEXT: ands r0, r1 +; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: bhi .LBB3_2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: movs r0, #0 @@ -105,10 +99,6 @@ ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI3_0: ; CHECK-NEXT: .long 511 @ 0x1ff -; CHECK-NEXT: .LCPI3_1: -; CHECK-NEXT: .long 2837897523 @ 0xa926e133 -; CHECK-NEXT: .LCPI3_2: -; CHECK-NEXT: .long 8471335 @ 0x814327 %urem = urem i9 %X, -5 %cmp = icmp ne i9 %urem, 0 ret i1 %cmp @@ -117,71 +107,73 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-LABEL: test_urem_vec: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: movs r3, r2 -; CHECK-NEXT: ldr r5, .LCPI4_0 -; CHECK-NEXT: ands r0, r5 -; CHECK-NEXT: ldr r6, .LCPI4_1 -; CHECK-NEXT: muls r6, r0, r6 -; CHECK-NEXT: movs r2, #1 -; CHECK-NEXT: rors r6, r2 -; CHECK-NEXT: ldr r0, .LCPI4_2 +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: ldr r3, .LCPI4_0 +; CHECK-NEXT: muls r3, r0, r3 +; CHECK-NEXT: lsls r0, r3, #10 +; CHECK-NEXT: ldr r4, .LCPI4_1 +; CHECK-NEXT: ands r4, r3 +; CHECK-NEXT: lsrs r3, r4, #1 +; CHECK-NEXT: adds r0, r3, r0 +; CHECK-NEXT: ldr r3, .LCPI4_2 +; CHECK-NEXT: ands r3, r0 +; CHECK-NEXT: lsrs r0, r3, #1 +; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: cmp r6, r0 -; CHECK-NEXT: push {r2} +; CHECK-NEXT: cmp r0, #170 +; CHECK-NEXT: push {r3} ; CHECK-NEXT: pop {r0} ; CHECK-NEXT: bhi .LBB4_2 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: movs r0, r4 ; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: ands r1, r5 -; CHECK-NEXT: ldr r6, .LCPI4_3 -; CHECK-NEXT: muls r6, r1, r6 +; CHECK-NEXT: ldr r5, .LCPI4_3 +; CHECK-NEXT: muls r5, r1, r5 ; CHECK-NEXT: ldr r1, .LCPI4_4 -; CHECK-NEXT: adds r1, r6, r1 -; CHECK-NEXT: ldr r6, .LCPI4_5 -; CHECK-NEXT: cmp r1, r6 -; CHECK-NEXT: push {r2} +; CHECK-NEXT: adds r1, r5, r1 +; CHECK-NEXT: movs r5, #73 +; CHECK-NEXT: lsls r5, r5, #23 +; CHECK-NEXT: cmp r1, r5 +; CHECK-NEXT: push {r3} ; CHECK-NEXT: pop {r1} ; CHECK-NEXT: bhi .LBB4_4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: movs r1, r4 ; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: ands r3, r5 -; CHECK-NEXT: ldr r5, .LCPI4_6 -; CHECK-NEXT: muls r5, r3, r5 -; CHECK-NEXT: ldr r3, .LCPI4_7 -; CHECK-NEXT: adds r3, r5, r3 -; CHECK-NEXT: ldr r5, .LCPI4_8 -; CHECK-NEXT: cmp r3, r5 +; CHECK-NEXT: ldr r5, .LCPI4_5 +; CHECK-NEXT: muls r5, r2, r5 +; CHECK-NEXT: ldr r2, .LCPI4_6 +; CHECK-NEXT: adds r2, r5, r2 +; CHECK-NEXT: ldr r5, .LCPI4_7 +; CHECK-NEXT: ands r5, r2 +; CHECK-NEXT: cmp r5, #1 ; CHECK-NEXT: bhi .LBB4_6 ; CHECK-NEXT: @ %bb.5: -; CHECK-NEXT: movs r2, r4 +; CHECK-NEXT: movs r3, r4 ; CHECK-NEXT: .LBB4_6: -; CHECK-NEXT: pop 
{r4, r5, r6}
+; CHECK-NEXT: movs r2, r3
+; CHECK-NEXT: pop {r4, r5, r7}
; CHECK-NEXT: pop {r3}
; CHECK-NEXT: bx r3
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI4_0:
-; CHECK-NEXT: .long 2047 @ 0x7ff
+; CHECK-NEXT: .long 683 @ 0x2ab
; CHECK-NEXT: .LCPI4_1:
-; CHECK-NEXT: .long 2863311531 @ 0xaaaaaaab
+; CHECK-NEXT: .long 2044 @ 0x7fc
; CHECK-NEXT: .LCPI4_2:
-; CHECK-NEXT: .long 715827882 @ 0x2aaaaaaa
+; CHECK-NEXT: .long 2046 @ 0x7fe
; CHECK-NEXT: .LCPI4_3:
-; CHECK-NEXT: .long 3067833783 @ 0xb6db6db7
+; CHECK-NEXT: .long 3068133376 @ 0xb6e00000
; CHECK-NEXT: .LCPI4_4:
-; CHECK-NEXT: .long 1227133513 @ 0x49249249
+; CHECK-NEXT: .long 1226833920 @ 0x49200000
; CHECK-NEXT: .LCPI4_5:
-; CHECK-NEXT: .long 613566756 @ 0x24924924
+; CHECK-NEXT: .long 819 @ 0x333
; CHECK-NEXT: .LCPI4_6:
-; CHECK-NEXT: .long 2198989619 @ 0x8311eb33
+; CHECK-NEXT: .long 4294965658 @ 0xfffff99a
; CHECK-NEXT: .LCPI4_7:
-; CHECK-NEXT: .long 4191955354 @ 0xf9dc299a
-; CHECK-NEXT: .LCPI4_8:
-; CHECK-NEXT: .long 2102284 @ 0x20140c
+; CHECK-NEXT: .long 2047 @ 0x7ff
%urem = urem <3 x i11> %X, <i11 6, i11 7, i11 -5>
%cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
ret <3 x i1> %cmp
diff --git a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/srem-seteq-illegal-types.ll
@@ -4,15 +4,15 @@
define i1 @test_srem_odd(i29 %X) nounwind {
; CHECK-LABEL: test_srem_odd:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movw r1, #64874
+; CHECK-NEXT: movw r1, #24493
; CHECK-NEXT: movw r2, #33099
-; CHECK-NEXT: sbfx r0, r0, #0, #29
-; CHECK-NEXT: movt r1, #330
-; CHECK-NEXT: movt r2, #48986
-; CHECK-NEXT: mla r1, r0, r2, r1
-; CHECK-NEXT: movw r2, #64213
+; CHECK-NEXT: movt r1, #41
+; CHECK-NEXT: movt r2, #8026
+; CHECK-NEXT: mla r0, r0, r2, r1
+; CHECK-NEXT: movw r2, #48987
+; CHECK-NEXT: movt r2, #82
+; CHECK-NEXT: bic r1, r0, #-536870912
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: movt r2, #661
; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r0, #1
diff --git a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/Thumb2/urem-seteq-illegal-types.ll
@@ -4,12 +4,12 @@
define i1 @test_urem_odd(i13 %X) nounwind {
; CHECK-LABEL: test_urem_odd:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movw r1, #52429
-; CHECK-NEXT: bfc r0, #13, #19
-; CHECK-NEXT: movt r1, #52428
+; CHECK-NEXT: movw r1, #3277
+; CHECK-NEXT: movw r2, #1639
; CHECK-NEXT: muls r1, r0, r1
; CHECK-NEXT: movs r0, #0
-; CHECK-NEXT: cmn.w r1, #-858993460
+; CHECK-NEXT: bfc r1, #13, #19
+; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: it lo
; CHECK-NEXT: movlo r0, #1
; CHECK-NEXT: bx lr
@@ -22,12 +22,13 @@
; CHECK-LABEL: test_urem_even:
; CHECK: @ %bb.0:
; CHECK-NEXT: movw r1, #28087
-; CHECK-NEXT: bic r0, r0, #-134217728
-; CHECK-NEXT: movt r1, #46811
-; CHECK-NEXT: movw r2, #9363
+; CHECK-NEXT: movw r2, #18725
+; CHECK-NEXT: movt r1, #1755
+; CHECK-NEXT: movt r2, #146
; CHECK-NEXT: muls r0, r1, r0
-; CHECK-NEXT: movt r2, #4681
-; CHECK-NEXT: ror.w r1, r0, #1
+; CHECK-NEXT: ubfx r1, r0, #1, #26
+; CHECK-NEXT: orr.w r0, r1, r0, lsl #26
+; CHECK-NEXT: bic r1, r0, #-134217728
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: it lo
@@ -41,12 +42,11 @@
define i1 @test_urem_odd_setne(i4 %X) nounwind {
; CHECK-LABEL: test_urem_odd_setne:
; CHECK: @ %bb.0:
-; CHECK-NEXT: movw r1, #52429
-; CHECK-NEXT: and r0, r0, #15
-; CHECK-NEXT:
movt r1, #52428 -; CHECK-NEXT: muls r1, r0, r1 +; CHECK-NEXT: movs r1, #13 +; CHECK-NEXT: muls r0, r1, r0 +; CHECK-NEXT: and r1, r0, #15 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: cmp.w r1, #858993459 +; CHECK-NEXT: cmp r1, #3 ; CHECK-NEXT: it hi ; CHECK-NEXT: movhi r0, #1 ; CHECK-NEXT: bx lr @@ -58,14 +58,11 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { ; CHECK-LABEL: test_urem_negative_odd: ; CHECK: @ %bb.0: -; CHECK-NEXT: movw r1, #57651 -; CHECK-NEXT: bfc r0, #9, #23 -; CHECK-NEXT: movt r1, #43302 -; CHECK-NEXT: movw r2, #17191 +; CHECK-NEXT: movw r1, #307 ; CHECK-NEXT: muls r1, r0, r1 ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: movt r2, #129 -; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: bfc r1, #9, #23 +; CHECK-NEXT: cmp r1, #1 ; CHECK-NEXT: it hi ; CHECK-NEXT: movhi r0, #1 ; CHECK-NEXT: bx lr @@ -77,50 +74,54 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; CHECK-LABEL: test_urem_vec: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: movw r3, #18725 -; CHECK-NEXT: bfc r1, #11, #21 -; CHECK-NEXT: movt r3, #9362 -; CHECK-NEXT: bfc r2, #11, #21 -; CHECK-NEXT: umull r3, r12, r1, r3 -; CHECK-NEXT: bfc r0, #11, #21 -; CHECK-NEXT: movw r3, #25663 -; CHECK-NEXT: movt r3, #160 -; CHECK-NEXT: umull r3, lr, r2, r3 -; CHECK-NEXT: vldr d17, .LCPI4_0 -; CHECK-NEXT: movw r3, #43691 -; CHECK-NEXT: movt r3, #43690 -; CHECK-NEXT: umull r3, r4, r0, r3 -; CHECK-NEXT: sub.w r3, r1, r12 -; CHECK-NEXT: add.w r3, r12, r3, lsr #1 -; CHECK-NEXT: lsr.w r12, r3, #2 -; CHECK-NEXT: sub.w r3, r2, lr -; CHECK-NEXT: lsrs r4, r4, #2 -; CHECK-NEXT: add.w r4, r4, r4, lsl #1 -; CHECK-NEXT: add.w r3, lr, r3, lsr #1 -; CHECK-NEXT: sub.w r0, r0, r4, lsl #1 -; CHECK-NEXT: lsr.w lr, r3, #10 -; CHECK-NEXT: movw r3, #2043 ; CHECK-NEXT: vmov.16 d16[0], r0 -; CHECK-NEXT: sub.w r0, r12, r12, lsl #3 -; CHECK-NEXT: mls r2, lr, r3, r2 -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: vmov.16 d16[1], r0 +; CHECK-NEXT: vldr d17, .LCPI4_0 +; CHECK-NEXT: vmov.16 d16[1], r1 +; CHECK-NEXT: vldr d19, .LCPI4_3 ; CHECK-NEXT: vmov.16 d16[2], r2 +; CHECK-NEXT: vsub.i16 d16, d16, d17 +; CHECK-NEXT: vldr d17, .LCPI4_1 +; CHECK-NEXT: vmul.i16 d16, d16, d17 +; CHECK-NEXT: vldr d17, .LCPI4_2 +; CHECK-NEXT: vneg.s16 d17, d17 +; CHECK-NEXT: vshl.i16 d18, d16, #1 +; CHECK-NEXT: vbic.i16 d16, #0xf800 +; CHECK-NEXT: vshl.u16 d16, d16, d17 +; CHECK-NEXT: vshl.u16 d17, d18, d19 +; CHECK-NEXT: vorr d16, d16, d17 +; CHECK-NEXT: vldr d17, .LCPI4_4 ; CHECK-NEXT: vbic.i16 d16, #0xf800 -; CHECK-NEXT: vceq.i16 d16, d16, d17 -; CHECK-NEXT: vmvn d16, d16 +; CHECK-NEXT: vcgt.u16 d16, d16, d17 ; CHECK-NEXT: vmov.u16 r0, d16[0] ; CHECK-NEXT: vmov.u16 r1, d16[1] ; CHECK-NEXT: vmov.u16 r2, d16[2] -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 3 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI4_0: ; CHECK-NEXT: .short 0 @ 0x0 ; CHECK-NEXT: .short 1 @ 0x1 ; CHECK-NEXT: .short 2 @ 0x2 +; CHECK-NEXT: .zero 2 +; CHECK-NEXT: .LCPI4_1: +; CHECK-NEXT: .short 683 @ 0x2ab +; CHECK-NEXT: .short 1463 @ 0x5b7 +; CHECK-NEXT: .short 819 @ 0x333 +; CHECK-NEXT: .zero 2 +; CHECK-NEXT: .LCPI4_2: +; CHECK-NEXT: .short 1 @ 0x1 +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .LCPI4_3: +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .LCPI4_4: +; CHECK-NEXT: .short 341 @ 0x155 +; CHECK-NEXT: .short 292 @ 0x124 +; CHECK-NEXT: .short 1 @ 0x1 ; CHECK-NEXT: .short 0 @ 0x0 %urem = urem <3 x 
i11> %X, <i11 6, i11 7, i11 -5>
%cmp = icmp ne <3 x i11> %urem, <i11 0, i11 1, i11 2>
diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
--- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
+++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll
@@ -83,33 +83,29 @@
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psrld $2, %xmm2
-; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT: movdqa %xmm3, %xmm0
+; SSE2-NEXT: psrld $1, %xmm0
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pxor {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: p4_vector_urem_by_const__splat:
; SSE4: # %bb.0:
; SSE4-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; SSE4-NEXT: pmuludq %xmm2, %xmm1
-; SSE4-NEXT: pmuludq %xmm0, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE4-NEXT: psrld $2, %xmm2
-; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm2
-; SSE4-NEXT: psubd %xmm2, %xmm0
-; SSE4-NEXT: pxor %xmm1, %xmm1
+; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm0
+; SSE4-NEXT: psrld $1, %xmm0
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; SSE4-NEXT: pminud %xmm0, %xmm1
; SSE4-NEXT: pcmpeqd %xmm1, %xmm0
; SSE4-NEXT: retq
;
@@ -117,16 +113,11 @@
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
-; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882]
+; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
%t0 = and <4 x i32> %x, <i32 128, i32 128, i32 128, i32 128> ; clearly a power-of-two or zero
@@ -140,59 +131,50 @@
; SSE2: # %bb.0:
; SSE2-NEXT: pand
{{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrld $2, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] -; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2 -; SSE2-NEXT: psubd %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm2 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: p5_vector_urem_by_const__nonsplat: ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] -; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE4-NEXT: pmuludq %xmm2, %xmm3 -; SSE4-NEXT: pmuludq %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; SSE4-NEXT: movdqa %xmm1, %xmm2 -; SSE4-NEXT: psrld $2, %xmm2 -; SSE4-NEXT: psrld $1, %xmm1 -; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] -; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm1 -; SSE4-NEXT: psubd %xmm1, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; SSE4-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; SSE4-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; SSE4-NEXT: por %xmm2, %xmm1 +; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [1431655765,858993459,715827882,477218588] +; SSE4-NEXT: pminud %xmm1, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX2-LABEL: p5_vector_urem_by_const__nonsplat: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpmaddwd 
{{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, @@ -206,39 +188,32 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: psrld $2, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: psubd %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 +; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: p6_vector_urem_by_const__nonsplat_undef0: ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; SSE4-NEXT: pmuludq %xmm2, %xmm1 -; SSE4-NEXT: pmuludq %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; SSE4-NEXT: psrld $2, %xmm2 -; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm2 -; SSE4-NEXT: psubd %xmm2, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrld $1, %xmm1 +; SSE4-NEXT: pslld $31, %xmm0 +; SSE4-NEXT: por %xmm1, %xmm0 +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; SSE4-NEXT: pminud %xmm0, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE4-NEXT: retq ; @@ -246,17 +221,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, 
%xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -10,22 +10,19 @@ define i1 @test_srem_odd(i29 %X) nounwind { ; X86-LABEL: test_srem_odd: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll $3, %eax -; X86-NEXT: sarl $3, %eax -; X86-NEXT: imull $-1084587701, %eax, %eax # imm = 0xBF5A814B -; X86-NEXT: addl $21691754, %eax # imm = 0x14AFD6A -; X86-NEXT: cmpl $43383509, %eax # imm = 0x295FAD5 +; X86-NEXT: imull $526025035, {{[0-9]+}}(%esp), %eax # imm = 0x1F5A814B +; X86-NEXT: addl $2711469, %eax # imm = 0x295FAD +; X86-NEXT: andl $536870911, %eax # imm = 0x1FFFFFFF +; X86-NEXT: cmpl $5422939, %eax # imm = 0x52BF5B ; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_srem_odd: ; X64: # %bb.0: -; X64-NEXT: shll $3, %edi -; X64-NEXT: sarl $3, %edi -; X64-NEXT: imull $-1084587701, %edi, %eax # imm = 0xBF5A814B -; X64-NEXT: addl $21691754, %eax # imm = 0x14AFD6A -; X64-NEXT: cmpl $43383509, %eax # imm = 0x295FAD5 +; X64-NEXT: imull $526025035, %edi, %eax # imm = 0x1F5A814B +; X64-NEXT: addl $2711469, %eax # imm = 0x295FAD +; X64-NEXT: andl $536870911, %eax # imm = 0x1FFFFFFF +; X64-NEXT: cmpl $5422939, %eax # imm = 0x52BF5B ; X64-NEXT: setb %al ; X64-NEXT: retq %srem = srem i29 %X, 99 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -9,122 +9,80 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,1374389535,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq 
{{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,25,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm5 -; CHECK-SSE41-NEXT: psrad $1, %xmm5 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3],xmm5[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: paddd %xmm5, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; 
CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,171798690,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm5 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1],xmm2[2,3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, 
%xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -270,112 +228,60 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] 
-; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), 
%xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -396,113 +302,60 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = <1,u,4294967295,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm6 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm6 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm6 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn %xmm3, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; 
CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -526,135 +379,80 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, 
%xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, 
%xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; 
CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -675,135 +473,80 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; 
CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -829,103 +572,73 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483649,1717986919] -; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrad $1, 
%xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = <1717986919,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm4 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 -; CHECK-SSE41-NEXT: psrad $1, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm4 -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; 
CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,858993458,268435454,858993458] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm4 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] -; CHECK-AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm4 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -949,88 +662,73 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; 
CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483649,2454267027] -; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 -; CHECK-SSE2-NEXT: retq -; -; CHECK-SSE41-LABEL: test_srem_even_poweroftwo: -; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq 
{{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435454,306783378] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1054,122 +752,80 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483649,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; 
CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,16,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; 
CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm5 -; CHECK-SSE41-NEXT: psrad $1, %xmm5 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3],xmm5[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: paddd %xmm5, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,268435454,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm5 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; 
CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1255,101 +911,60 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; 
CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1372,129 +987,80 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa 
{{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1603,111 +1169,99 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: 
test_srem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483647,2454267027] -; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 -; CHECK-SSE2-NEXT: psrad $3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 -; CHECK-SSE2-NEXT: psrad $30, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <2147483648,u,2,u> +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; 
CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483647,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrad $30, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: por %xmm5, %xmm4 +; CHECK-SSE41-NEXT: pminud %xmm4, %xmm3 +; CHECK-SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] +; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm2, %xmm4 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1735,135 +1289,103 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483647,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $30, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,2147483648,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $30, %xmm1 -; 
CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE41-NEXT: psrad $1, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm4 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm4 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; 
CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1892,133 +1414,79 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,1,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1717986919] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; 
CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3],xmm4[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd 
{{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,858993458] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd 
{{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 @@ -2038,121 +1506,80 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm5 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,0,1,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd 
{{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2175,134 +1602,80 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd 
{{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; 
CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE41-NEXT: psrad $1, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; 
CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2388,120 +1761,60 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] -; CHECK-SSE2-NEXT: psrld $31, %xmm5 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 -; CHECK-SSE2-NEXT: movdqa 
{{.*#+}} xmm2 = [14,4294967295,1,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad 
$3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2524,127 +1837,80 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,0,1374389535] -; 
CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, 
%xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $1, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2669,124 +1935,80 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1717986919] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $3, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,858993458] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; 
CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2809,109 +2031,80 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2147483649,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, 
%xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,1,0,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: 
paddd %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435454,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; 
CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2934,129 +2127,80 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -3080,108 +2224,68 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, 
%xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,2147483649,0] -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE2-NEXT: psrlq $32, %xmm2 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $1, %xmm4 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: psrlq $32, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $1, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; 
CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -3203,102 +2307,68 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; 
CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,0,2147483649,0] -; CHECK-SSE2-NEXT: pand %xmm6, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6 -; CHECK-SSE2-NEXT: psrlq $32, %xmm6 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm6 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $31, %xmm6 -; CHECK-SSE2-NEXT: pand %xmm3, %xmm6 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: psrlq $32, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; 
CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll 
b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -71,87 +71,60 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: psrad $5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pslld $30, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pslld $30, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; 
CHECK-AVX1-LABEL: test_srem_even_100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -239,87 +212,60 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_neg100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2920577761,1374389535,2920577761,1374389535] -; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: movdqa 
%xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pslld $30, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_neg100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2920577761,u,2920577761,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pslld $30, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_neg100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, 
%xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_neg100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -10,20 +10,17 @@ define i1 @test_urem_odd(i13 %X) nounwind { ; X86-LABEL: test_urem_odd: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $3277, {{[0-9]+}}(%esp), %eax # imm = 0xCCD ; X86-NEXT: andl $8191, %eax # imm = 0x1FFF -; X86-NEXT: imull $-13107, %eax, %eax # imm = 0xCCCD -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: cmpl $13108, %eax # imm = 0x3334 +; X86-NEXT: cmpl $1639, %eax # imm = 0x667 ; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd: ; X64: # %bb.0: -; X64-NEXT: andl $8191, %edi # imm = 0x1FFF -; X64-NEXT: imull $-13107, %edi, %eax # imm = 0xCCCD -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: cmpl $13108, %eax # imm = 0x3334 +; X64-NEXT: imull $3277, %edi, %eax # imm = 0xCCD +; X64-NEXT: andl $8191, %eax # imm = 0x1FFF +; X64-NEXT: cmpl $1639, %eax # imm = 0x667 ; X64-NEXT: setb %al ; X64-NEXT: retq %urem = urem i13 %X, 5 @@ -34,20 +31,27 @@ define i1 @test_urem_even(i27 %X) nounwind { ; X86-LABEL: test_urem_even: ; X86: # %bb.0: -; X86-NEXT: movl $134217727, %eax # imm = 0x7FFFFFF -; X86-NEXT: andl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull $-1227133513, %eax, %eax # imm = 0xB6DB6DB7 -; X86-NEXT: rorl %eax -; X86-NEXT: cmpl $306783379, %eax # imm = 0x12492493 +; X86-NEXT: imull $115043767, {{[0-9]+}}(%esp), %eax # imm = 0x6DB6DB7 +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $26, %ecx +; X86-NEXT: andl $134217726, %eax # imm = 0x7FFFFFE +; X86-NEXT: shrl %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF +; X86-NEXT: cmpl $9586981, %eax # imm = 0x924925 ; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_even: ; X64: # %bb.0: -; X64-NEXT: andl $134217727, %edi # imm = 0x7FFFFFF -; X64-NEXT: imull $-1227133513, %edi, %eax 
# imm = 0xB6DB6DB7 -; X64-NEXT: rorl %eax -; X64-NEXT: cmpl $306783379, %eax # imm = 0x12492493 +; X64-NEXT: imull $115043767, %edi, %eax # imm = 0x6DB6DB7 +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: shll $26, %ecx +; X64-NEXT: andl $134217726, %eax # imm = 0x7FFFFFE +; X64-NEXT: shrl %eax +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: andl $134217727, %eax # imm = 0x7FFFFFF +; X64-NEXT: cmpl $9586981, %eax # imm = 0x924925 ; X64-NEXT: setb %al ; X64-NEXT: retq %urem = urem i27 %X, 14 @@ -58,20 +62,21 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; X86-LABEL: test_urem_odd_setne: ; X86: # %bb.0: -; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: leal (%eax,%eax,2), %ecx +; X86-NEXT: leal (%eax,%ecx,4), %eax ; X86-NEXT: andb $15, %al -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: imull $-51, %eax, %eax -; X86-NEXT: cmpb $51, %al +; X86-NEXT: cmpb $3, %al ; X86-NEXT: seta %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_odd_setne: ; X64: # %bb.0: -; X64-NEXT: andb $15, %dil -; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: imull $-51, %eax, %eax -; X64-NEXT: cmpb $51, %al +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rdi,2), %eax +; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: andb $15, %al +; X64-NEXT: cmpb $3, %al ; X64-NEXT: seta %al ; X64-NEXT: retq %urem = urem i4 %X, 5 @@ -82,20 +87,17 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { ; X86-LABEL: test_urem_negative_odd: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $307, {{[0-9]+}}(%esp), %eax # imm = 0x133 ; X86-NEXT: andl $511, %eax # imm = 0x1FF -; X86-NEXT: imull $-7885, %eax, %eax # imm = 0xE133 -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: cmpl $129, %eax +; X86-NEXT: cmpw $1, %ax ; X86-NEXT: seta %al ; X86-NEXT: retl ; ; X64-LABEL: test_urem_negative_odd: ; X64: # %bb.0: -; X64-NEXT: andl $511, %edi # imm = 0x1FF -; X64-NEXT: imull $-7885, %edi, %eax # imm = 0xE133 -; X64-NEXT: movzwl %ax, %eax -; X64-NEXT: cmpl $129, %eax +; X64-NEXT: imull $307, %edi, %eax # imm = 0x133 +; X64-NEXT: andl $511, %eax # imm = 0x1FF +; X64-NEXT: cmpw $1, %ax ; X64-NEXT: seta %al ; X64-NEXT: retq %urem = urem i9 %X, -5 @@ -106,67 +108,55 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; X86-LABEL: test_urem_vec: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl $2047, %edx # imm = 0x7FF -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: imull $683, {{[0-9]+}}(%esp), %eax # imm = 0x2AB +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $10, %ecx +; X86-NEXT: andl $2046, %eax # imm = 0x7FE +; X86-NEXT: shrl %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: andl $2047, %eax # imm = 0x7FF -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $2047, %ecx # imm = 0x7FF -; X86-NEXT: imull $-5325, %ecx, %ecx # imm = 0xEB33 -; X86-NEXT: addl $10650, %ecx # imm = 0x299A -; X86-NEXT: cmpw $32, %cx -; X86-NEXT: seta %cl -; X86-NEXT: imull $-21845, %eax, %eax # imm = 0xAAAB -; X86-NEXT: rorw %ax -; X86-NEXT: movzwl %ax, %eax -; X86-NEXT: cmpl $10922, %eax # imm = 0x2AAA +; X86-NEXT: cmpl $341, %eax # imm = 0x155 ; X86-NEXT: seta %al -; X86-NEXT: imull $28087, %edx, %edx # imm = 0x6DB7 -; X86-NEXT: addl $-28087, %edx # imm = 0x9249 -; X86-NEXT: movzwl %dx, %edx -; X86-NEXT: cmpl $9362, %edx # imm = 0x2492 +; X86-NEXT: imull $1463, {{[0-9]+}}(%esp), %ecx # imm = 0x5B7 +; X86-NEXT: addl $-1463, %ecx # imm = 0xFA49 +; X86-NEXT: andl $2047, %ecx # imm = 0x7FF +; X86-NEXT: cmpl $292, %ecx # imm = 0x124 ; X86-NEXT: seta %dl +; 
X86-NEXT: imull $819, {{[0-9]+}}(%esp), %ecx # imm = 0x333 +; X86-NEXT: addl $-1638, %ecx # imm = 0xF99A +; X86-NEXT: andl $2047, %ecx # imm = 0x7FF +; X86-NEXT: cmpw $1, %cx +; X86-NEXT: seta %cl ; X86-NEXT: retl ; ; SSE2-LABEL: test_urem_vec: ; SSE2: # %bb.0: -; SSE2-NEXT: movl %esi, %eax -; SSE2-NEXT: andl $2047, %eax # imm = 0x7FF -; SSE2-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: subl %ecx, %eax -; SSE2-NEXT: movzwl %ax, %eax -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: addl %ecx, %eax -; SSE2-NEXT: shrl $2, %eax -; SSE2-NEXT: leal (,%rax,8), %ecx -; SSE2-NEXT: subl %ecx, %eax -; SSE2-NEXT: addl %esi, %eax -; SSE2-NEXT: andl $2047, %edi # imm = 0x7FF -; SSE2-NEXT: imull $43691, %edi, %ecx # imm = 0xAAAB -; SSE2-NEXT: shrl $17, %ecx -; SSE2-NEXT: andl $-2, %ecx -; SSE2-NEXT: leal (%rcx,%rcx,2), %ecx -; SSE2-NEXT: subl %ecx, %edi -; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: pinsrw $2, %eax, %xmm0 -; SSE2-NEXT: movl %edx, %eax -; SSE2-NEXT: andl $2047, %eax # imm = 0x7FF -; SSE2-NEXT: imull $161, %eax, %ecx -; SSE2-NEXT: shrl $16, %ecx -; SSE2-NEXT: subl %ecx, %eax -; SSE2-NEXT: movzwl %ax, %eax -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: addl %ecx, %eax -; SSE2-NEXT: shrl $10, %eax -; SSE2-NEXT: imull $2043, %eax, %eax # imm = 0x7FB -; SSE2-NEXT: subl %eax, %edx -; SSE2-NEXT: pinsrw $4, %edx, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: movd %edi, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: psubd {{.*}}(%rip), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = <683,1463,819,u> +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2047,2047,2047,2047] +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm2[0],xmm3[1,2,3] +; SSE2-NEXT: pslld $10, %xmm1 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: andps %xmm0, %xmm2 +; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm2 +; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %dl ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %cl @@ -174,45 +164,25 @@ ; ; SSE41-LABEL: test_urem_vec: ; SSE41: # %bb.0: -; SSE41-NEXT: movl %esi, %eax -; SSE41-NEXT: andl $2047, %eax # imm = 0x7FF -; SSE41-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; SSE41-NEXT: shrl $16, %ecx -; SSE41-NEXT: subl %ecx, %eax -; SSE41-NEXT: movzwl %ax, %eax -; SSE41-NEXT: shrl %eax -; SSE41-NEXT: addl %ecx, %eax -; SSE41-NEXT: shrl $2, %eax -; SSE41-NEXT: leal (,%rax,8), %ecx -; SSE41-NEXT: subl %ecx, %eax -; SSE41-NEXT: addl %esi, %eax -; SSE41-NEXT: andl $2047, %edi # imm = 0x7FF -; SSE41-NEXT: imull $43691, %edi, %ecx # imm = 0xAAAB -; SSE41-NEXT: shrl $17, %ecx -; SSE41-NEXT: andl $-2, %ecx -; SSE41-NEXT: leal (%rcx,%rcx,2), %ecx -; SSE41-NEXT: subl %ecx, %edi ; SSE41-NEXT: movd %edi, 
%xmm0 -; SSE41-NEXT: pinsrw $2, %eax, %xmm0 -; SSE41-NEXT: movl %edx, %eax -; SSE41-NEXT: andl $2047, %eax # imm = 0x7FF -; SSE41-NEXT: imull $161, %eax, %ecx -; SSE41-NEXT: shrl $16, %ecx -; SSE41-NEXT: subl %ecx, %eax -; SSE41-NEXT: movzwl %ax, %eax -; SSE41-NEXT: shrl %eax -; SSE41-NEXT: addl %ecx, %eax -; SSE41-NEXT: shrl $10, %eax -; SSE41-NEXT: imull $2043, %eax, %eax # imm = 0x7FB -; SSE41-NEXT: subl %eax, %edx -; SSE41-NEXT: pinsrw $4, %edx, %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: pextrb $4, %xmm1, %edx -; SSE41-NEXT: pextrb $8, %xmm1, %ecx +; SSE41-NEXT: pinsrd $1, %esi, %xmm0 +; SSE41-NEXT: pinsrd $2, %edx, %xmm0 +; SSE41-NEXT: psubd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: psrld $1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: pslld $10, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7] +; SSE41-NEXT: por %xmm2, %xmm3 +; SSE41-NEXT: pand %xmm1, %xmm3 +; SSE41-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; SSE41-NEXT: movd %xmm3, %eax +; SSE41-NEXT: pextrb $4, %xmm3, %edx +; SSE41-NEXT: pextrb $8, %xmm3, %ecx ; SSE41-NEXT: # kill: def $al killed $al killed $eax ; SSE41-NEXT: # kill: def $dl killed $dl killed $edx ; SSE41-NEXT: # kill: def $cl killed $cl killed $ecx @@ -220,42 +190,21 @@ ; ; AVX1-LABEL: test_urem_vec: ; AVX1: # %bb.0: -; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: andl $2047, %eax # imm = 0x7FF -; AVX1-NEXT: imull $9363, %eax, %ecx # imm = 0x2493 -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $2, %eax -; AVX1-NEXT: leal (,%rax,8), %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: addl %esi, %eax -; AVX1-NEXT: andl $2047, %edi # imm = 0x7FF -; AVX1-NEXT: imull $43691, %edi, %ecx # imm = 0xAAAB -; AVX1-NEXT: shrl $17, %ecx -; AVX1-NEXT: andl $-2, %ecx -; AVX1-NEXT: leal (%rcx,%rcx,2), %ecx -; AVX1-NEXT: subl %ecx, %edi ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 -; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: andl $2047, %eax # imm = 0x7FF -; AVX1-NEXT: imull $161, %eax, %ecx -; AVX1-NEXT: shrl $16, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: movzwl %ax, %eax -; AVX1-NEXT: shrl %eax -; AVX1-NEXT: addl %ecx, %eax -; AVX1-NEXT: shrl $10, %eax -; AVX1-NEXT: imull $2043, %eax, %eax # imm = 0x7FB -; AVX1-NEXT: subl %eax, %edx -; AVX1-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2047,2047,2047,2047] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3,4,5,6,7] +; AVX1-NEXT: vpslld $10, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vpextrb $4, %xmm0, %edx ; AVX1-NEXT: vpextrb $8, %xmm0, %ecx @@ -266,43 +215,18 @@ ; ; AVX2-LABEL: test_urem_vec: ; AVX2: # %bb.0: -; AVX2-NEXT: andl $2047, %esi # imm = 0x7FF -; AVX2-NEXT: imull $9363, %esi, %eax # imm = 0x2493 -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: shrl $2, %ecx -; AVX2-NEXT: leal (,%rcx,8), %eax -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: addl %esi, %ecx -; AVX2-NEXT: andl $2047, %edi # imm = 0x7FF -; AVX2-NEXT: imull $43691, %edi, %eax # imm = 0xAAAB -; AVX2-NEXT: shrl $17, %eax -; AVX2-NEXT: andl $-2, %eax -; AVX2-NEXT: leal (%rax,%rax,2), %eax -; AVX2-NEXT: subl %eax, %edi ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: andl $2047, %edx # imm = 0x7FF -; AVX2-NEXT: imull $161, %edx, %eax -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: movl %edx, %ecx -; AVX2-NEXT: subl %eax, %ecx -; AVX2-NEXT: movzwl %cx, %ecx -; AVX2-NEXT: shrl %ecx -; AVX2-NEXT: addl %eax, %ecx -; AVX2-NEXT: shrl $10, %ecx -; AVX2-NEXT: imull $2043, %ecx, %eax # imm = 0x7FB -; AVX2-NEXT: subl %eax, %edx -; AVX2-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2047,2047,2047,2047] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 +; AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vmovd %xmm0, %eax ; AVX2-NEXT: vpextrb $4, %xmm0, %edx ; AVX2-NEXT: vpextrb $8, %xmm0, %ecx @@ -313,40 +237,17 @@ ; ; AVX512VL-LABEL: test_urem_vec: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: andl $2047, %esi # imm = 0x7FF -; AVX512VL-NEXT: imull $9363, %esi, %eax # imm = 0x2493 -; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: movl %esi, %ecx -; AVX512VL-NEXT: subl %eax, %ecx -; AVX512VL-NEXT: movzwl %cx, %ecx -; AVX512VL-NEXT: shrl %ecx -; AVX512VL-NEXT: addl %eax, %ecx -; AVX512VL-NEXT: shrl $2, %ecx -; AVX512VL-NEXT: leal (,%rcx,8), %eax -; AVX512VL-NEXT: subl %eax, %ecx -; AVX512VL-NEXT: addl %esi, %ecx -; AVX512VL-NEXT: andl $2047, %edi # imm = 0x7FF -; AVX512VL-NEXT: imull $43691, %edi, %eax # imm = 0xAAAB -; AVX512VL-NEXT: shrl $17, %eax -; AVX512VL-NEXT: andl $-2, %eax -; AVX512VL-NEXT: leal (%rax,%rax,2), %eax -; AVX512VL-NEXT: subl %eax, %edi ; AVX512VL-NEXT: vmovd %edi, %xmm0 -; AVX512VL-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0 -; AVX512VL-NEXT: andl $2047, %edx # imm = 0x7FF -; AVX512VL-NEXT: imull $161, %edx, %eax -; AVX512VL-NEXT: shrl $16, %eax -; AVX512VL-NEXT: movl %edx, %ecx -; AVX512VL-NEXT: subl %eax, %ecx -; AVX512VL-NEXT: movzwl %cx, %ecx -; AVX512VL-NEXT: shrl %ecx -; AVX512VL-NEXT: addl %eax, %ecx -; AVX512VL-NEXT: shrl $10, %ecx -; AVX512VL-NEXT: imull $2043, %ecx, %eax # imm = 0x7FB -; AVX512VL-NEXT: subl %eax, %edx -; AVX512VL-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; 
AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; AVX512VL-NEXT: vpcmpneqd {{.*}}(%rip), %xmm0, %k0 +; AVX512VL-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm1 +; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2047,2047,2047,2047] +; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogd $200, %xmm1, %xmm2, %xmm0 +; AVX512VL-NEXT: vpcmpnleud {{.*}}(%rip), %xmm0, %k0 ; AVX512VL-NEXT: kshiftrw $1, %k0, %k1 ; AVX512VL-NEXT: kmovw %k1, %edx ; AVX512VL-NEXT: kshiftrw $2, %k0, %k1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll @@ -295,17 +295,19 @@ define i1 @t64_3_2(i64 %X) nounwind { ; X86-LABEL: t64_3_2: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: xorl $2, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: sete %al -; X86-NEXT: addl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %edx # imm = 0xAAAAAAAB +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: adcl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: sbbl $1431655765, %edx # imm = 0x55555555 +; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: t64_3_2: diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -9,98 +9,71 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: psrld $3, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,25,100] -; CHECK-SSE2-NEXT: 
pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,171798691,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq 
%xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -197,84 +170,64 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), 
%xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_eq: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -295,87 +248,66 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: 
pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783379,306783379,2,306783379] +; CHECK-SSE41-NEXT: pmaxud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_ne: ; 
CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne: @@ -396,98 +328,71 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,4294967295,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,4294967295,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; 
CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -508,101 +413,73 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,4294967295,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,4294967295,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: 
pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993460,306783379,2,42949673] +; CHECK-SSE41-NEXT: pmaxud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} 
xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne: @@ -625,73 +502,64 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,268435455,858993459] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo: ; 
CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -714,81 +582,64 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), 
%xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435455,306783378] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -811,95 
+662,71 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,268435456,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,16,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,268435456,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; 
CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,268435455,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,268435456,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld 
{{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -979,81 +806,54 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_one: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_one: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; 
CHECK-AVX1-LABEL: test_urem_even_one: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1076,93 +876,71 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,14,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,0,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,0,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: 
vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1187,73 +965,64 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,1,858993459] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; 
CHECK-AVX2-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1276,81 +1045,64 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; 
CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; 
CHECK-AVX2-NEXT: retq @@ -1373,95 +1125,71 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,2,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,2147483648,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; 
CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd 
%xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1486,89 +1214,66 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,5] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} 
xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,858993459] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 
+; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1591,98 +1296,71 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,14] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,1,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,1,268435456,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 -; 
CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = 
xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1705,93 +1383,71 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,268435456,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrld $5, %xmm4 -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = 
[3435973837,2147483649,268435456,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1855,95 +1511,71 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,0,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,1,1,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; 
CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,1,2147483648] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,1,2147483648] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX2: # 
%bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1966,88 +1598,71 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrld $5, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: 
movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2072,83 +1687,71 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,268435456,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,1,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,858993459] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, 
%xmm0 ; CHECK-AVX2-NEXT: retq @@ -2171,92 +1774,71 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,1,0,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,268435456,1,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: 
pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435455,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; 
CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2279,85 +1861,71 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrld $5, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,16,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,268435456,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; 
CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2381,90 +1949,66 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = 
[3435973837,2147483649,268435456,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,1] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: 
por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2486,97 +2030,66 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; 
CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: psrld $1, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [14,4294967295,16,1] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,1,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = 
xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ 
b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -119,67 +119,59 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_6_part0: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $1, %xmm0 +; CHECK-SSE2-NEXT: pslld $31, %xmm3 +; CHECK-SSE2-NEXT: por %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_6_part0: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_6_part0: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: 
vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_6_part0: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_6_part0: @@ -198,67 +190,58 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_6_part1: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $1, %xmm0 +; CHECK-SSE2-NEXT: pslld $31, %xmm3 +; CHECK-SSE2-NEXT: por %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_6_part1: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: 
movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827881,715827881,715827882,715827882] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_6_part1: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_6_part1: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_6_part1: @@ -277,71 +260,49 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_tautological: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} 
xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: psrld $1, %xmm3
-; CHECK-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2,3]
-; CHECK-SSE2-NEXT: movapd %xmm2, %xmm3
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: t32_tautological:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE41-NEXT: psrld $1, %xmm3
-; CHECK-SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pxor %xmm0, %xmm0
+; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: t32_tautological:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: t32_tautological:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; CHECK-AVX2-NEXT: retq
;
; CHECK-AVX512VL-LABEL: t32_tautological:
diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll
@@ -65,73 +65,55 @@
define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind {
; CHECK-SSE2-LABEL: test_urem_even_100:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psrld $5, %xmm2
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100]
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: pslld $30, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1
-; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: psrld $5, %xmm2
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pslld $30, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_100:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100]
-; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145]
+; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672]
+; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
@@ -196,74 +178,51 @@
; CHECK-SSE2-LABEL: test_urem_even_neg100:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: psrld $5, %xmm1
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1
; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE2-NEXT: psrld $2, %xmm2
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE2-NEXT: psrld $27, %xmm2
-; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0
-; CHECK-SSE2-NEXT: psrld $31, %xmm0
+; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE2-NEXT: psrld $2, %xmm1
+; CHECK-SSE2-NEXT: pslld $30, %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-SSE41-LABEL: test_urem_even_neg100:
; CHECK-SSE41: # %bb.0:
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: psrld $5, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2
-; CHECK-SSE41-NEXT: psrld $2, %xmm2
-; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-SSE41-NEXT: psrld $27, %xmm2
-; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2
-; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0
-; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1
+; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: psrld $2, %xmm1
+; CHECK-SSE41-NEXT: pslld $30, %xmm0
+; CHECK-SSE41-NEXT: por %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,42949672,1,42949672]
+; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1
; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; CHECK-SSE41-NEXT: psrld $31, %xmm0
; CHECK-SSE41-NEXT: retq
;
; CHECK-AVX1-LABEL: test_urem_even_neg100:
; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; CHECK-AVX1-NEXT: vpsrld $27, %xmm2, %xmm2
-; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX1-NEXT: retq
;
; CHECK-AVX2-LABEL: test_urem_even_neg100:
; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925]
-; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1
+; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1
; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; CHECK-AVX2-NEXT: retq
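
Note (not part of the patch): the new CHECK sequences for test_urem_even_100 / test_urem_even_neg100 all encode the multiply-by-modular-inverse divisibility test (Hacker's Delight, section 10-17) that the urem-seteq fold emits: multiply by the inverse of the odd part of the divisor modulo 2^32, rotate right by the divisor's trailing-zero count (the psrld $2 / pslld $30 / por triple, since SSE/AVX2 have no packed 32-bit rotate), then compare against floor((2^32-1)/d) via pminud/pcmpeqd. A minimal standalone C++ sketch for the splat d = 100 case, using the constants visible in the checks above (the function name and harness are hypothetical):

// Sketch of the fold the test_urem_even_100 checks verify:
// for d = 100 = 4 * 25,
//   X % 100 == 0  <=>  rotr(X * inv(25), 2) <= floor((2^32 - 1) / 100)
// where inv(25) is the multiplicative inverse of 25 modulo 2^32.
#include <cassert>
#include <cstdint>

static bool urem100_eq_zero(uint32_t X) {
  const uint32_t Inv25 = 3264175145u; // 25 * Inv25 == 1 (mod 2^32); the pmulld constant above
  const uint32_t Limit = 42949672u;   // floor((2^32 - 1) / 100); the pminud constant above
  uint32_t Mul = X * Inv25;                // exact X / 25 whenever 25 divides X
  uint32_t Rot = (Mul >> 2) | (Mul << 30); // rotr by 2: psrld $2 / pslld $30 / por
  return Rot <= Limit;                     // in range only if both 4 and 25 divide X
}

int main() {
  for (uint64_t X = 0; X <= 1000000; ++X)
    assert(urem100_eq_zero(uint32_t(X)) == (X % 100 == 0));
}

The rotate is what makes even divisors work: multiplying by the inverse of the odd part moves the factor-of-4 information into the two low bits, and rotating it into the high bits makes any nonzero residue exceed the limit.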