diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5253,7 +5253,7 @@ EVT ShSVT = ShVT.getScalarType(); // If MUL is unavailable, we cannot proceed in any case. - if (!isOperationLegalOrCustom(ISD::MUL, VT)) + if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT)) return SDValue(); bool ComparingWithAllZeros = true; @@ -5383,7 +5383,7 @@ } if (!ComparingWithAllZeros && !AllComparisonsWithNonZerosAreTautological) { - if (!isOperationLegalOrCustom(ISD::SUB, VT)) + if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::SUB, VT)) return SDValue(); // FIXME: Could/should use `ISD::ADD`? assert(CompTargetNode.getValueType() == N.getValueType() && "Expecting that the types on LHS and RHS of comparisons match."); @@ -5398,7 +5398,7 @@ // divisors as a performance improvement, since rotating by 0 is a no-op. if (HadEvenDivisor) { // We need ROTR to do this. - if (!isOperationLegalOrCustom(ISD::ROTR, VT)) + if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT)) return SDValue(); SDNodeFlags Flags; Flags.setExact(true); @@ -5438,7 +5438,7 @@ } // Else, we can just invert the comparison result in the appropriate lanes. - if (isOperationLegalOrCustom(ISD::XOR, SETCCVT)) + if (!DCI.isBeforeLegalizeOps() && isOperationLegalOrCustom(ISD::XOR, SETCCVT)) return DAG.getNode(ISD::XOR, DL, SETCCVT, NewCC, TautologicalInvertedChannels); @@ -5493,7 +5493,7 @@ EVT ShSVT = ShVT.getScalarType(); // If MUL is unavailable, we cannot proceed in any case. - if (!isOperationLegalOrCustom(ISD::MUL, VT)) + if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::MUL, VT)) return SDValue(); // TODO: Could support comparing with non-zero too. @@ -5638,7 +5638,7 @@ if (NeedToApplyOffset) { // We need ADD to do this. - if (!isOperationLegalOrCustom(ISD::ADD, VT)) + if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ADD, VT)) return SDValue(); // (add (mul N, P), A) @@ -5650,7 +5650,7 @@ // divisors as a performance improvement, since rotating by 0 is a no-op. if (HadEvenDivisor) { // We need ROTR to do this. - if (!isOperationLegalOrCustom(ISD::ROTR, VT)) + if (!DCI.isBeforeLegalizeOps() && !isOperationLegalOrCustom(ISD::ROTR, VT)) return SDValue(); SDNodeFlags Flags; Flags.setExact(true); diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll @@ -9,20 +9,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0] ; CHECK-NEXT: adrp x8, .LCPI0_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] -; CHECK-NEXT: adrp x8, .LCPI0_2 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2] ; CHECK-NEXT: adrp x8, .LCPI0_3 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_3] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -82,27 +80,19 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: adrp x8, .LCPI3_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] -; CHECK-NEXT: adrp x8, .LCPI3_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: adrp x8, .LCPI3_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_4] -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #9362 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI3_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI3_0] +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v0.4s, v2.4s, #31 +; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -114,28 +104,19 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_ne: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI4_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: adrp x8, .LCPI4_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] -; CHECK-NEXT: adrp x8, .LCPI4_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] -; CHECK-NEXT: adrp x8, .LCPI4_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI4_3] -; CHECK-NEXT: adrp x8, .LCPI4_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_4] -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #9362 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI4_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI4_0] +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v0.4s, v2.4s, #31 +; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v3.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -153,23 +134,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0] ; CHECK-NEXT: adrp x8, .LCPI5_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] -; CHECK-NEXT: adrp x8, .LCPI5_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] ; CHECK-NEXT: adrp x8, .LCPI5_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_3] +; CHECK-NEXT: adrp x8, .LCPI5_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI5_2] ; CHECK-NEXT: adrp x8, .LCPI5_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -185,24 +161,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0] ; CHECK-NEXT: adrp x8, .LCPI6_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] -; CHECK-NEXT: adrp x8, .LCPI6_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2] ; CHECK-NEXT: adrp x8, .LCPI6_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_3] +; CHECK-NEXT: adrp x8, .LCPI6_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI6_2] ; CHECK-NEXT: adrp x8, .LCPI6_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -222,20 +192,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] ; CHECK-NEXT: adrp x8, .LCPI7_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1] -; CHECK-NEXT: adrp x8, .LCPI7_2 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2] ; CHECK-NEXT: adrp x8, .LCPI7_3 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_3] +; CHECK-NEXT: adrp x8, .LCPI7_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI7_2] +; CHECK-NEXT: adrp x8, .LCPI7_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -253,14 +221,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] ; CHECK-NEXT: adrp x8, .LCPI8_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1] -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-NEXT: sshr v3.4s, v1.4s, #3 -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI8_3 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_3] +; CHECK-NEXT: adrp x8, .LCPI8_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI8_2] +; CHECK-NEXT: adrp x8, .LCPI8_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -278,20 +250,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] ; CHECK-NEXT: adrp x8, .LCPI9_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] -; CHECK-NEXT: adrp x8, .LCPI9_2 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] ; CHECK-NEXT: adrp x8, .LCPI9_3 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_3] +; CHECK-NEXT: adrp x8, .LCPI9_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI9_2] +; CHECK-NEXT: adrp x8, .LCPI9_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -330,25 +300,19 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_one: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: adrp x8, .LCPI11_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] -; CHECK-NEXT: adrp x8, .LCPI11_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: adrp x8, .LCPI11_3 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #9362 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI11_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI11_0] +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v0.4s, v2.4s, #31 +; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -366,24 +330,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] ; CHECK-NEXT: adrp x8, .LCPI12_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] -; CHECK-NEXT: adrp x8, .LCPI12_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] ; CHECK-NEXT: adrp x8, .LCPI12_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_3] +; CHECK-NEXT: adrp x8, .LCPI12_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI12_2] ; CHECK-NEXT: adrp x8, .LCPI12_4 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -495,23 +453,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: adrp x8, .LCPI16_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] -; CHECK-NEXT: adrp x8, .LCPI16_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] ; CHECK-NEXT: adrp x8, .LCPI16_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: adrp x8, .LCPI16_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI16_2] ; CHECK-NEXT: adrp x8, .LCPI16_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -529,23 +482,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: adrp x8, .LCPI17_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] -; CHECK-NEXT: adrp x8, .LCPI17_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] ; CHECK-NEXT: adrp x8, .LCPI17_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI17_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_3] +; CHECK-NEXT: adrp x8, .LCPI17_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI17_2] ; CHECK-NEXT: adrp x8, .LCPI17_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -563,23 +511,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: adrp x8, .LCPI18_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] -; CHECK-NEXT: adrp x8, .LCPI18_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] ; CHECK-NEXT: adrp x8, .LCPI18_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: adrp x8, .LCPI18_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI18_2] ; CHECK-NEXT: adrp x8, .LCPI18_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -618,27 +561,19 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_allones_and_one: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI20_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] -; CHECK-NEXT: adrp x8, .LCPI20_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] -; CHECK-NEXT: adrp x8, .LCPI20_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2] -; CHECK-NEXT: adrp x8, .LCPI20_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_3] -; CHECK-NEXT: adrp x8, .LCPI20_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_4] -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #9362 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: movk w9, #4681, lsl #16 +; CHECK-NEXT: adrp x10, .LCPI20_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x10, :lo12:.LCPI20_0] +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v0.4s, v2.4s, #31 +; CHECK-NEXT: ushr v1.4s, v2.4s, #1 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -656,23 +591,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: adrp x8, .LCPI21_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_1] -; CHECK-NEXT: adrp x8, .LCPI21_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2] ; CHECK-NEXT: adrp x8, .LCPI21_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_3] +; CHECK-NEXT: adrp x8, .LCPI21_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI21_2] ; CHECK-NEXT: adrp x8, .LCPI21_4 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI21_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -692,24 +622,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: adrp x8, .LCPI22_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_1] -; CHECK-NEXT: adrp x8, .LCPI22_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2] ; CHECK-NEXT: adrp x8, .LCPI22_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: adrp x8, .LCPI22_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI22_2] ; CHECK-NEXT: adrp x8, .LCPI22_4 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI22_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -727,21 +651,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: adrp x8, .LCPI23_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1] -; CHECK-NEXT: adrp x8, .LCPI23_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2] -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s ; CHECK-NEXT: adrp x8, .LCPI23_3 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v0.4s -; CHECK-NEXT: sshl v2.4s, v1.4s, v2.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_3] +; CHECK-NEXT: adrp x8, .LCPI23_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI23_2] +; CHECK-NEXT: adrp x8, .LCPI23_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -759,24 +680,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] ; CHECK-NEXT: adrp x8, .LCPI24_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_1] -; CHECK-NEXT: adrp x8, .LCPI24_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2] ; CHECK-NEXT: adrp x8, .LCPI24_3 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: adrp x8, .LCPI24_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI24_2] ; CHECK-NEXT: adrp x8, .LCPI24_4 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI24_4] +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_4] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: sshl v3.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -795,22 +710,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_0] ; CHECK-NEXT: adrp x8, .LCPI25_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_1] -; CHECK-NEXT: adrp x8, .LCPI25_2 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_2] ; CHECK-NEXT: adrp x8, .LCPI25_3 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI25_3] -; CHECK-NEXT: neg v4.4s, v4.4s -; CHECK-NEXT: movi v3.2d, #0x000000ffffffff -; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_3] +; CHECK-NEXT: adrp x8, .LCPI25_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI25_2] +; CHECK-NEXT: adrp x8, .LCPI25_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -827,22 +738,18 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_0] ; CHECK-NEXT: adrp x8, .LCPI26_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] -; CHECK-NEXT: adrp x8, .LCPI26_2 -; CHECK-NEXT: smull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_2] ; CHECK-NEXT: adrp x8, .LCPI26_3 -; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] -; CHECK-NEXT: neg v4.4s, v4.4s -; CHECK-NEXT: movi v3.2d, #0x000000ffffffff -; CHECK-NEXT: sshl v4.4s, v1.4s, v4.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #31 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_3] +; CHECK-NEXT: adrp x8, .LCPI26_2 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI26_2] +; CHECK-NEXT: adrp x8, .LCPI26_4 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_4] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ushl v2.4s, v2.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -29,17 +29,20 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 -; CHECK-NEXT: movk w8, #20971, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: smull2 v3.2d, v0.4s, v2.4s -; CHECK-NEXT: smull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sshr v3.4s, v2.4s, #5 -; CHECK-NEXT: movi v1.4s, #100 -; CHECK-NEXT: usra v3.4s, v2.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: mov w9, #47184 +; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: movk w9, #1310, lsl #16 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: mov w10, #23592 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: movk w10, #655, lsl #16 +; CHECK-NEXT: shl v0.4s, v2.4s, #30 +; CHECK-NEXT: ushr v1.4s, v2.4s, #2 +; CHECK-NEXT: dup v3.4s, w10 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -79,17 +82,20 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_neg100: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI3_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: adrp x8, .LCPI3_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: smull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sshr v3.4s, v1.4s, #5 -; CHECK-NEXT: usra v3.4s, v1.4s, #31 -; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: mov w9, #47184 +; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: movk w9, #1310, lsl #16 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: mov w10, #23592 +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: movk w10, #655, lsl #16 +; CHECK-NEXT: shl v0.4s, v2.4s, #30 +; CHECK-NEXT: ushr v1.4s, v2.4s, #2 +; CHECK-NEXT: dup v3.4s, w10 +; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -83,18 +83,13 @@ define i16 @test_srem_even(i16 %X) nounwind { ; CHECK-LABEL: test_srem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #9363 -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: movk w9, #37449, lsl #16 -; CHECK-NEXT: smull x9, w8, w9 -; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: asr w9, w8, #3 -; CHECK-NEXT: add w8, w9, w8, lsr #31 -; CHECK-NEXT: mov w9, #14 -; CHECK-NEXT: msub w8, w8, w9, w0 -; CHECK-NEXT: tst w8, #0xffff -; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mov w9, #4680 +; CHECK-NEXT: madd w8, w0, w8, w9 +; CHECK-NEXT: lsl w10, w8, #15 +; CHECK-NEXT: bfxil w10, w8, #1, #15 +; CHECK-NEXT: cmp w9, w10, uxth +; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret %srem = srem i16 %X, 14 %cmp = icmp ne i16 %srem, 0 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll @@ -195,14 +195,12 @@ define i1 @t16_3_2(i16 %X) nounwind { ; CHECK-LABEL: t16_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: mov w10, #-1431655766 -; CHECK-NEXT: madd w8, w8, w9, w10 -; CHECK-NEXT: mov w9, #1431655765 -; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: mov w8, #-21845 +; CHECK-NEXT: mov w9, #-21846 +; CHECK-NEXT: madd w8, w0, w8, w9 +; CHECK-NEXT: mov w9, #21845 +; CHECK-NEXT: cmp w9, w8, uxth +; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: ret %urem = urem i16 %X, 3 %cmp = icmp eq i16 %urem, 2 @@ -212,13 +210,11 @@ define i1 @t8_3_2(i8 %X) nounwind { ; CHECK-LABEL: t8_3_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #43691 -; CHECK-NEXT: and w8, w0, #0xff -; CHECK-NEXT: movk w9, #43690, lsl #16 -; CHECK-NEXT: mov w10, #-1431655766 -; CHECK-NEXT: madd w8, w8, w9, w10 -; CHECK-NEXT: mov w9, #1431655765 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: mov w8, #-85 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: sub w8, w8, #86 // =86 +; CHECK-NEXT: and w8, w8, #0xff +; CHECK-NEXT: cmp w8, #85 // =85 ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret %urem = urem i8 %X, 3 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll @@ -11,17 +11,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1] ; CHECK-NEXT: adrp x8, .LCPI0_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI0_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -79,17 +76,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] ; CHECK-NEXT: adrp x8, .LCPI3_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI3_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -107,18 +101,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_1] ; CHECK-NEXT: adrp x8, .LCPI4_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI4_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI4_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -138,17 +128,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1] ; CHECK-NEXT: adrp x8, .LCPI5_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI5_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -166,18 +153,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_1] ; CHECK-NEXT: adrp x8, .LCPI6_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI6_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI6_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI6_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: mvn v0.16b, v0.16b +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -199,13 +182,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI7_1] ; CHECK-NEXT: adrp x8, .LCPI7_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI7_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI7_3 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI7_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -225,17 +209,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_1] ; CHECK-NEXT: adrp x8, .LCPI8_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI8_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI8_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI8_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -255,17 +236,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1] ; CHECK-NEXT: adrp x8, .LCPI9_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI9_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -301,26 +279,16 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_even_one: ; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI11_0 -; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI11_0] -; CHECK-NEXT: adrp x8, .LCPI11_1 -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_1] -; CHECK-NEXT: adrp x8, .LCPI11_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI11_2] -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: adrp x8, .LCPI11_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI11_3] -; CHECK-NEXT: adrp x8, .LCPI11_4 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI11_4] -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: movk w8, #46811, lsl #16 +; CHECK-NEXT: adrp x9, .LCPI11_0 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v1.4s, v0.4s, #31 +; CHECK-NEXT: ushr v0.4s, v0.4s, #1 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -340,20 +308,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_1] ; CHECK-NEXT: adrp x8, .LCPI12_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI12_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI12_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI12_3] -; CHECK-NEXT: adrp x8, .LCPI12_4 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI12_4] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -375,13 +337,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_1] ; CHECK-NEXT: adrp x8, .LCPI13_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI13_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI13_3 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -401,17 +364,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_1] ; CHECK-NEXT: adrp x8, .LCPI14_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI14_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI14_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI14_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -431,17 +391,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_1] ; CHECK-NEXT: adrp x8, .LCPI15_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI15_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI15_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI15_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -463,13 +420,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1] ; CHECK-NEXT: adrp x8, .LCPI16_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI16_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI16_3 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -489,17 +447,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_1] ; CHECK-NEXT: adrp x8, .LCPI17_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI17_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI17_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI17_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -519,13 +474,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI18_1] ; CHECK-NEXT: adrp x8, .LCPI18_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI18_2] -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: adrp x8, .LCPI18_3 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -566,20 +522,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_1] ; CHECK-NEXT: adrp x8, .LCPI20_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI20_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI20_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI20_3] -; CHECK-NEXT: adrp x8, .LCPI20_4 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_4] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -600,15 +550,13 @@ ; CHECK-NEXT: adrp x8, .LCPI21_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI21_2] ; CHECK-NEXT: adrp x8, .LCPI21_3 -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI21_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -631,15 +579,13 @@ ; CHECK-NEXT: adrp x8, .LCPI22_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI22_2] ; CHECK-NEXT: adrp x8, .LCPI22_3 -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI22_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -659,20 +605,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_1] ; CHECK-NEXT: adrp x8, .LCPI23_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI23_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI23_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI23_3] -; CHECK-NEXT: adrp x8, .LCPI23_4 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI23_4] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -693,15 +633,13 @@ ; CHECK-NEXT: adrp x8, .LCPI24_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI24_2] ; CHECK-NEXT: adrp x8, .LCPI24_3 -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI24_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI24_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -723,15 +661,13 @@ ; CHECK-NEXT: adrp x8, .LCPI25_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI25_2] ; CHECK-NEXT: adrp x8, .LCPI25_3 -; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI25_3] -; CHECK-NEXT: neg v2.4s, v2.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI25_3] +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -750,20 +686,14 @@ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_1] ; CHECK-NEXT: adrp x8, .LCPI26_2 ; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI26_2] -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI26_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI26_3] -; CHECK-NEXT: adrp x8, .LCPI26_4 -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI26_4] +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI26_3] ; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: bit v1.16b, v0.16b, v2.16b -; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: ushl v2.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -45,18 +45,20 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-LABEL: t32_6_part0: ; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, .LCPI2_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] ; CHECK-NEXT: mov w8, #43691 ; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: adrp x9, .LCPI2_0 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI2_0] -; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #2 -; CHECK-NEXT: movi v3.4s, #6 -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s +; CHECK-NEXT: dup v2.4s, w8 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov w9, #43690 +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: movk w9, #10922, lsl #16 +; CHECK-NEXT: shl v1.4s, v0.4s, #31 +; CHECK-NEXT: ushr v0.4s, v0.4s, #1 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, w9 +; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, @@ -67,18 +69,19 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind { ; CHECK-LABEL: t32_6_part1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: adrp x9, .LCPI3_0 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI3_0] -; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s -; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v1.4s, v1.4s, #2 -; CHECK-NEXT: movi v3.4s, #6 -; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, v2.4s +; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] +; CHECK-NEXT: mov w9, #43691 +; CHECK-NEXT: movk w9, #43690, lsl #16 +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s +; CHECK-NEXT: shl v1.4s, v0.4s, #31 +; CHECK-NEXT: ushr v0.4s, v0.4s, #1 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v3.4s, v0.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-splat.ll @@ -26,16 +26,17 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_urem_even_100: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #34079 -; CHECK-NEXT: movk w8, #20971, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s -; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s -; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: movi v1.4s, #100 -; CHECK-NEXT: ushr v2.4s, v2.4s, #5 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mov w8, #23593 +; CHECK-NEXT: movk w8, #49807, lsl #16 +; CHECK-NEXT: dup v1.4s, w8 +; CHECK-NEXT: mov w9, #23592 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movk w9, #655, lsl #16 +; CHECK-NEXT: shl v1.4s, v0.4s, #30 +; CHECK-NEXT: ushr v0.4s, v0.4s, #2 +; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -74,19 +75,11 @@ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] ; CHECK-NEXT: adrp x8, .LCPI3_1 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1] -; CHECK-NEXT: adrp x8, .LCPI3_2 -; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] -; CHECK-NEXT: neg v1.4s, v1.4s -; CHECK-NEXT: adrp x8, .LCPI3_3 -; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s -; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s -; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s -; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s -; CHECK-NEXT: neg v3.4s, v3.4s -; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 +; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s +; CHECK-NEXT: shl v1.4s, v0.4s, #30 +; CHECK-NEXT: ushr v0.4s, v0.4s, #2 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmhs v0.4s, v2.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-tautological.ll @@ -22,14 +22,23 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI1_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: cmhs v0.4s, v1.4s, v0.4s +; CHECK-NEXT: adrp x8, .LCPI1_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1] +; CHECK-NEXT: adrp x8, .LCPI1_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2] +; CHECK-NEXT: adrp x8, .LCPI1_3 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_3] +; CHECK-NEXT: adrp x8, .LCPI1_4 +; CHECK-NEXT: umull2 v5.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI1_4] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s ; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: movi d1, #0xffff0000ffff0000 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp eq <4 x i32> %urem, @@ -41,14 +50,24 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: movk w8, #43690, lsl #16 -; CHECK-NEXT: dup v2.4s, w8 -; CHECK-NEXT: mul v0.4s, v0.4s, v2.4s -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s +; CHECK-NEXT: adrp x8, .LCPI2_1 +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1] +; CHECK-NEXT: adrp x8, .LCPI2_2 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2] +; CHECK-NEXT: adrp x8, .LCPI2_3 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI2_3] +; CHECK-NEXT: adrp x8, .LCPI2_4 +; CHECK-NEXT: umull2 v5.2d, v0.4s, v1.4s +; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: uzp2 v1.4s, v1.4s, v5.4s +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI2_4] +; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v1.16b, v0.16b, v3.16b +; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s +; CHECK-NEXT: cmeq v0.4s, v0.4s, v5.4s +; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: movi d1, #0xffff0000ffff0000 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, %cmp = icmp ne <4 x i32> %urem, @@ -60,13 +79,25 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: adrp x8, .LCPI3_0 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: mov w8, #43691 -; CHECK-NEXT: dup v2.8h, w8 -; CHECK-NEXT: mul v0.8h, v0.8h, v2.8h -; CHECK-NEXT: cmhs v0.8h, v1.8h, v0.8h +; CHECK-NEXT: adrp x8, .LCPI3_1 +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_1] +; CHECK-NEXT: adrp x8, .LCPI3_2 +; CHECK-NEXT: umull2 v4.4s, v0.8h, v1.8h +; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v4.8h +; CHECK-NEXT: neg v3.8h, v3.8h +; CHECK-NEXT: movi v2.2d, #0xffff00000000ffff +; CHECK-NEXT: ushl v1.8h, v1.8h, v3.8h +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2] +; CHECK-NEXT: adrp x8, .LCPI3_3 +; CHECK-NEXT: movi v4.2d, #0x00ffffffff0000 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3] +; CHECK-NEXT: and v4.16b, v0.16b, v4.16b +; CHECK-NEXT: orr v1.16b, v4.16b, v1.16b +; CHECK-NEXT: mls v0.8h, v1.8h, v3.8h +; CHECK-NEXT: cmeq v0.8h, v0.8h, v2.8h ; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: movi d1, #0xffff0000ffff0000 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <8 x i16> %X, %cmp = icmp eq <8 x i16> %urem, @@ -77,19 +108,18 @@ ; CHECK-LABEL: t3_wide: ; CHECK: // %bb.0: ; CHECK-NEXT: mov x9, #-6148914691236517206 -; CHECK-NEXT: adrp x11, .LCPI4_0 -; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x8, d0 ; CHECK-NEXT: movk x9, #43691 -; CHECK-NEXT: fmov x10, d0 -; CHECK-NEXT: ldr q0, [x11, :lo12:.LCPI4_0] -; CHECK-NEXT: mul x10, x10, x9 -; CHECK-NEXT: mul x8, x8, x9 -; CHECK-NEXT: fmov d1, x10 -; CHECK-NEXT: mov v1.d[1], x8 -; CHECK-NEXT: cmhs v0.2d, v0.2d, v1.2d +; CHECK-NEXT: adrp x10, .LCPI4_0 +; CHECK-NEXT: umulh x9, x8, x9 +; CHECK-NEXT: ldr q0, [x10, :lo12:.LCPI4_0] +; CHECK-NEXT: lsr x9, x9, #1 +; CHECK-NEXT: add x9, x9, x9, lsl #1 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: mov v1.d[0], x8 +; CHECK-NEXT: cmeq v0.2d, v1.2d, v0.2d ; CHECK-NEXT: xtn v0.2s, v0.2d -; CHECK-NEXT: movi d1, #0xffffffff00000000 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret %urem = urem <2 x i64> %X, %cmp = icmp eq <2 x i64> %urem, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll --- a/llvm/test/CodeGen/AArch64/urem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -78,14 +78,13 @@ define i16 @test_urem_even(i16 %X) nounwind { ; CHECK-LABEL: test_urem_even: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w9, #28087 -; CHECK-NEXT: and w8, w0, #0xffff -; CHECK-NEXT: movk w9, #46811, lsl #16 -; CHECK-NEXT: mul w8, w8, w9 -; CHECK-NEXT: mov w9, #9362 -; CHECK-NEXT: ror w8, w8, #1 -; CHECK-NEXT: movk w9, #4681, lsl #16 -; CHECK-NEXT: cmp w8, w9 +; CHECK-NEXT: mov w8, #28087 +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: and w9, w8, #0xfffc +; CHECK-NEXT: lsr w9, w9, #1 +; CHECK-NEXT: bfi w9, w8, #15, #17 +; CHECK-NEXT: ubfx w8, w9, #1, #15 +; CHECK-NEXT: cmp w8, #2340 // =2340 ; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: ret %urem = urem i16 %X, 14 diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll --- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -81,17 +81,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] ; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; CHECK-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpsrld $1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %t0 = and <4 x i32> %x, ; clearly a power-of-two or zero @@ -104,17 +98,11 @@ ; CHECK-LABEL: p5_vector_urem_by_const__nonsplat: ; CHECK: # %bb.0: ; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,954437177] -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %t0 = and <4 x i32> %x, @@ -128,17 +116,13 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] ; CHECK-NEXT: vpand %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; CHECK-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; CHECK-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %t0 = and <4 x i32> %x, diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -9,122 +9,80 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,1374389535,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,25,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm5 -; CHECK-SSE41-NEXT: psrad $1, %xmm5 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3],xmm5[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: paddd %xmm5, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,171798690,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm5 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1],xmm2[2,3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -270,112 +228,60 @@ define <4 x i32> @test_srem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -396,113 +302,60 @@ define <4 x i32> @test_srem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = <1,u,4294967295,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm6 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm6 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm6 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn %xmm3, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -526,135 +379,80 @@ define <4 x i32> @test_srem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -675,135 +473,80 @@ define <4 x i32> @test_srem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,4294967295,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -829,103 +572,73 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,1717986919,2147483649,1717986919] -; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrad $1, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = <1717986919,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm4 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm2 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm3 -; CHECK-SSE41-NEXT: psrad $1, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm4 -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,858993458,268435454,858993458] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm4 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm2, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1717986919,1717986919,1717986919,1717986919] -; CHECK-AVX2-NEXT: vpmuldq %xmm4, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm4 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm2, %xmm3 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -949,88 +662,73 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483649,2454267027] -; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 -; CHECK-SSE2-NEXT: retq -; -; CHECK-SSE41-LABEL: test_srem_even_poweroftwo: -; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: retq +; +; CHECK-SSE41-LABEL: test_srem_even_poweroftwo: +; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435454,306783378] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1054,122 +752,80 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1717986919,2454267027,2147483649,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0] -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,16,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm5 -; CHECK-SSE41-NEXT: psrad $1, %xmm5 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3],xmm5[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: paddd %xmm5, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,268435454,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm5 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483649,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1255,101 +911,60 @@ define <4 x i32> @test_srem_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,0,0,0] -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm0, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1372,129 +987,80 @@ define <4 x i32> @test_srem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2454267027,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1603,111 +1169,99 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2454267027,2147483647,2454267027] -; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = <1,u,4294967295,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm2 -; CHECK-SSE2-NEXT: psrad $3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm4, %xmm3 -; CHECK-SSE2-NEXT: psrad $30, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <2147483648,u,2,u> +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pxor %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483647,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,4294967295,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrad $30, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3067833783,3067833783,1,3067833783] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] +; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; CHECK-SSE41-NEXT: por %xmm5, %xmm4 +; CHECK-SSE41-NEXT: pminud %xmm4, %xmm3 +; CHECK-SSE41-NEXT: pcmpeqd %xmm4, %xmm3 +; CHECK-SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] +; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; CHECK-AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [306783378,306783378,0,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm2, %xmm4 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1735,135 +1289,103 @@ define <4 x i32> @test_srem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4294967295,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,2454267027,2147483647,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $30, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,14,2147483648,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm5, %xmm3 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0] +; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] +; CHECK-SSE2-NEXT: psrld $31, %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4294967295,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $30, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE41-NEXT: psrad $1, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm4 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm4 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm5 +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [858993458,306783378,0,42949672] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm2 +; CHECK-SSE41-NEXT: pand {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $30, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2147483648,2,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm4 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm4, %xmm4 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3 +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm2, %xmm3 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2454267027,2147483647,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm2, %xmm3 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] +; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -1892,133 +1414,79 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,1,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1717986919] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm4, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1,2,3],xmm4[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,858993458] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1717986919] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: retq -; +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: retq +; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_and_poweroftwo: ; CHECK-AVX512VL: # %bb.0: ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 @@ -2038,121 +1506,80 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,2147483649,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm5 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,0,1,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,2147483649,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2175,134 +1602,80 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,2147483649,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $3, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE41-NEXT: psrad $1, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3],xmm4[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,2147483649,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2388,120 +1761,60 @@ define <4 x i32> @test_srem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,0,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,4294967295,1,1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm5 -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm6, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm5, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[1,2] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1] -; CHECK-SSE2-NEXT: psrld $31, %xmm5 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm5 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $1, %xmm1 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,4294967295,1,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,4294967295,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,0,0,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2524,127 +1837,80 @@ define <4 x i32> @test_srem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,0,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1717986919,0,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm1 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,1,0] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $1, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3,4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,0,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2669,124 +1935,80 @@ define <4 x i32> @test_srem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1717986919] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $1, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $3, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,858993458] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1717986919] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2809,109 +2031,80 @@ define <4 x i32> @test_srem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295,0,4294967295] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2454267027,2147483649,0,2454267027] -; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm5, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm5 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm3 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,1,0,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] ; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrad $3, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435454,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -2934,129 +2127,80 @@ define <4 x i32> @test_srem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,2147483649,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,0,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4294967295,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm5 -; CHECK-SSE2-NEXT: psrad $1, %xmm5 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,16,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3,4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE41-NEXT: psrad $3, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm1 -; CHECK-SSE41-NEXT: psrad $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,268435454,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3,4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm4 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1717986919,2147483649,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuldq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2],xmm2[3] -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -3080,108 +2224,68 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1717986919,0,2147483649,0] -; CHECK-SSE2-NEXT: pand %xmm2, %xmm4 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE2-NEXT: psrlq $32, %xmm2 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrad $3, %xmm3 -; CHECK-SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm4 -; CHECK-SSE2-NEXT: psrad $1, %xmm4 -; CHECK-SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [5,4294967295,16,1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,1,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1717986919,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: psrlq $32, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrad $1, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 ; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993458,4294967295,268435454,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -3203,102 +2307,68 @@ define <4 x i32> @test_srem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1] +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,0,4294967295,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm5, %xmm5 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2454267027,0,2147483649,0] -; CHECK-SSE2-NEXT: pand %xmm6, %xmm5 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm5 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm6 -; CHECK-SSE2-NEXT: psrlq $32, %xmm6 -; CHECK-SSE2-NEXT: psubd %xmm5, %xmm6 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm4 -; CHECK-SSE2-NEXT: psrad $3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $31, %xmm6 -; CHECK-SSE2-NEXT: pand %xmm3, %xmm6 -; CHECK-SSE2-NEXT: paddd %xmm4, %xmm6 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,16,1] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm6 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm1, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,4294967295,1,1] -; CHECK-SSE41-NEXT: pmulld %xmm0, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2454267027,u,2147483649,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: psrlq $32, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrad $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm2 -; CHECK-SSE41-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm3 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,4294967295,268435454,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm3, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpsrlq $32, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3] -; CHECK-AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -71,87 +71,60 @@ define <4 x i32> @test_srem_even_100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: pxor %xmm3, %xmm3 -; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; CHECK-SSE2-NEXT: pand %xmm1, %xmm4 -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm1 -; CHECK-SSE2-NEXT: psrad $5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pslld $30, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-SSE41-NEXT: pmuldq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pslld $30, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -239,87 +212,60 @@ define <4 x i32> @test_srem_even_neg100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_neg100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2920577761,1374389535,2920577761,1374389535] -; CHECK-SSE2-NEXT: pand %xmm3, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,0,4294967295,0] -; CHECK-SSE2-NEXT: pand %xmm0, %xmm4 -; CHECK-SSE2-NEXT: paddd %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm4, %xmm2 -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: psrad $5, %xmm2 -; CHECK-SSE2-NEXT: paddd %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pslld $30, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_srem_even_neg100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuldq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <2920577761,u,2920577761,u> -; CHECK-SSE41-NEXT: pmuldq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrad $5, %xmm2 -; CHECK-SSE41-NEXT: paddd %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: paddd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pslld $30, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_srem_even_neg100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuldq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpsrad $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_even_neg100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2920577761,2920577761,2920577761,2920577761] -; CHECK-AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] +; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll @@ -295,17 +295,19 @@ define i1 @t64_3_2(i64 %X) nounwind { ; X86-LABEL: t64_3_2: ; X86: # %bb.0: -; X86-NEXT: subl $12, %esp -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $3 -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: calll __umoddi3 -; X86-NEXT: addl $16, %esp -; X86-NEXT: xorl $2, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: sete %al -; X86-NEXT: addl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edx +; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %edx # imm = 0xAAAAAAAB +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA +; X86-NEXT: adcl $-1431655766, %edx # imm = 0xAAAAAAAA +; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: sbbl $1431655765, %edx # imm = 0x55555555 +; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: t64_3_2: diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -9,98 +9,71 @@ define <4 x i32> @test_urem_odd_even(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: psrld $3, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,25,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,171798691,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $3, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -197,84 +170,64 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_eq: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -295,87 +248,66 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783379,306783379,2,306783379] +; CHECK-SSE41-NEXT: pmaxud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_ne: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_even_allones_ne: @@ -396,98 +328,71 @@ define <4 x i32> @test_urem_odd_even_allones_eq(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,4294967295,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,4294967295,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_eq: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -508,101 +413,73 @@ define <4 x i32> @test_urem_odd_even_allones_ne(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,4294967295,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: psrld $31, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,4294967295,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: psrld $31, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_ne: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $31, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pandn {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993460,306783379,2,42949673] +; CHECK-SSE41-NEXT: pmaxud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpandn {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2147483649,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_odd_even_allones_ne: @@ -625,73 +502,64 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,268435456,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,268435456,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,268435455,858993459] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -714,81 +582,64 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,268435455,306783378] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -811,95 +662,71 @@ define <4 x i32> @test_urem_odd_even_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,268435456,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,16,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,268435456,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,268435455,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,268435456,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,268435456,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,268435456,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -979,81 +806,54 @@ define <4 x i32> @test_urem_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_one: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pslld $31, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_one: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 ; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [306783378,306783378,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_one: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1076,93 +876,71 @@ define <4 x i32> @test_urem_odd_even_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: movaps %xmm0, %xmm3 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,14,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,0,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,0,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1187,73 +965,64 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3435973837,u,2,u> -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = <3435973837,u,2,u> -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,858993459,1,858993459] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX1: # %bb.0: +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1276,81 +1045,64 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm2, %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,306783378,1,306783378] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_INT_MIN: ; CHECK-AVX2: # %bb.0: +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2454267027,2454267027,2454267027,2454267027] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1373,95 +1125,71 @@ define <4 x i32> @test_urem_odd_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,3067833783,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2147483648,2,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm3 -; CHECK-SSE2-NEXT: psrld $2, %xmm3 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,14,2147483648,100] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,306783378,1,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [3435973837,2454267027,2,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2147483648,2,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_INT_MIN: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2454267027,2,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1486,89 +1214,66 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [5,4294967295,16,5] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,858993459] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,3435973837] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1591,98 +1296,71 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [14,4294967295,16,14] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm2[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,1,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,1,268435456,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,268435456,2147483648] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,268435456,2454267027] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm3, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1705,93 +1383,71 @@ define <4 x i32> @test_urem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,268435456,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,100] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm4 -; CHECK-SSE2-NEXT: psrld $5, %xmm4 -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm4[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,268435456,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_poweroftwo: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1855,95 +1511,71 @@ define <4 x i32> @test_urem_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: psrld $31, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,4294967295,1,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,0,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,1,1,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: psrld $31, %xmm4 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: psubd %xmm4, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,1,1,2147483648] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,2147483649,0,2454267027] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,1,1,2147483648] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,0,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1966,88 +1598,71 @@ define <4 x i32> @test_urem_odd_even_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrld $5, %xmm2 -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,1,1,1073741824] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1073741824] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2072,83 +1687,71 @@ define <4 x i32> @test_urem_odd_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3435973837] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,268435456,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [5,16,1,5] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,858993459] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,3435973837] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2171,92 +1774,71 @@ define <4 x i32> @test_urem_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: psrld $1, %xmm1 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [14,16,1,14] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm4, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,1,0,3067833783] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,268435456,1,2147483648] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm4 -; CHECK-SSE41-NEXT: pmuludq %xmm3, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,268435455,4294967295,306783378] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483648,268435456,1,2147483648] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2454267027,268435456,0,2454267027] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,268435456,0,2454267027] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2279,85 +1861,71 @@ define <4 x i32> @test_urem_odd_even_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE2-NEXT: psrld $5, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm2[3,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,16,1,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,1,0,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,268435456,1,1073741824] +; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: movdqa %xmm3, %xmm2 -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-SSE41-NEXT: por %xmm1, %xmm2 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,268435455,4294967295,42949672] +; CHECK-SSE41-NEXT: pminud %xmm2, %xmm0 +; CHECK-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] +; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,268435456,1,1073741824] ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm2, %xmm3 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_even_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,268435456,0,1374389535] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2381,90 +1949,66 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,4294967295,1,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: movdqa %xmm2, %xmm1 -; CHECK-SSE2-NEXT: psrld $2, %xmm1 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [5,4294967295,16,1] ; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: psubd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [858993459,1,268435455,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3435973837,2147483649,268435456,0] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -2486,97 +2030,66 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm4 -; CHECK-SSE2-NEXT: psrld $1, %xmm4 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [14,4294967295,16,1] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: psrld $31, %xmm3 -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,4294967295,1,0] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: por %xmm3, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE41-NEXT: psrld $1, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $2, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm3 -; CHECK-SSE41-NEXT: psrld $31, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm3 -; CHECK-SSE41-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-SSE41-NEXT: por %xmm2, %xmm1 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [306783378,1,268435455,4294967295] +; CHECK-SSE41-NEXT: pminud %xmm1, %xmm0 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2454267027,2147483649,268435456,0] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; CHECK-AVX1-NEXT: vpsrld $31, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; CHECK-AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [2454267027,2147483649,268435456,0] -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm3 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 -; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -119,67 +119,59 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_6_part0: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $1, %xmm0 +; CHECK-SSE2-NEXT: pslld $31, %xmm3 +; CHECK-SSE2-NEXT: por %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_6_part0: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_6_part0: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_6_part0: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] +; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_6_part0: @@ -198,67 +190,58 @@ define <4 x i1> @t32_6_part1(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_6_part1: ; CHECK-SSE2: # %bb.0: +; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; CHECK-SSE2-NEXT: movdqa %xmm3, %xmm0 +; CHECK-SSE2-NEXT: psrld $1, %xmm0 +; CHECK-SSE2-NEXT: pslld $31, %xmm3 +; CHECK-SSE2-NEXT: por %xmm0, %xmm3 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm3 +; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; CHECK-SSE2-NEXT: pxor %xmm3, %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_6_part1: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $1, %xmm1 +; CHECK-SSE41-NEXT: pslld $31, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [715827881,715827881,715827882,715827882] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_6_part1: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_6_part1: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_6_part1: @@ -277,71 +260,49 @@ define <4 x i1> @t32_tautological(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: t32_tautological: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 +; CHECK-SSE2-NEXT: psubd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: psrld $1, %xmm3 -; CHECK-SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; CHECK-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,1,2,3] -; CHECK-SSE2-NEXT: movapd %xmm2, %xmm3 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm3, %xmm0 -; CHECK-SSE2-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: t32_tautological: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE41-NEXT: psrld $1, %xmm3 -; CHECK-SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: psubd {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295,4294967295,1431655764] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; CHECK-SSE41-NEXT: pxor %xmm0, %xmm0 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: t32_tautological: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,2147483648,2863311531] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: t32_tautological: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3] -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpcmpeqd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: t32_tautological: diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -65,73 +65,55 @@ define <4 x i32> @test_urem_even_100(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_urem_even_100: ; CHECK-SSE2: # %bb.0: -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrld $5, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [100,100,100,100] -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: pmuludq %xmm1, %xmm3 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pslld $30, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-SSE41-NEXT: pmuludq %xmm2, %xmm1 -; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: psrld $5, %xmm2 -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pslld $30, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] +; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] +; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -196,74 +178,51 @@ ; CHECK-SSE2-LABEL: test_urem_even_neg100: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE2-NEXT: psrld $5, %xmm1 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE2-NEXT: psrld $2, %xmm2 -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE2-NEXT: psrld $27, %xmm2 -; CHECK-SSE2-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE2-NEXT: pxor %xmm1, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 -; CHECK-SSE2-NEXT: psrld $31, %xmm0 +; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrld $2, %xmm1 +; CHECK-SSE2-NEXT: pslld $30, %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 +; CHECK-SSE2-NEXT: pxor {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: pandn {{.*}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: test_urem_even_neg100: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; CHECK-SSE41-NEXT: psrld $5, %xmm1 -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm2 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-SSE41-NEXT: psrld $27, %xmm2 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm2 -; CHECK-SSE41-NEXT: psubd %xmm2, %xmm0 -; CHECK-SSE41-NEXT: pxor %xmm1, %xmm1 +; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0 +; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pslld $30, %xmm0 +; CHECK-SSE41-NEXT: por %xmm1, %xmm0 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,42949672,1,42949672] +; CHECK-SSE41-NEXT: pminud %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 ; CHECK-SSE41-NEXT: psrld $31, %xmm0 ; CHECK-SSE41-NEXT: retq ; ; CHECK-AVX1-LABEL: test_urem_even_neg100: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm2 -; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-AVX1-NEXT: vpsrld $27, %xmm2, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX1-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX1-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_even_neg100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1374389535,1374389535,1374389535,1374389535] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [536870925,536870925,536870925,536870925] -; CHECK-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; CHECK-AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq