Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -322,6 +322,8 @@
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
   case ISD::SDIV:
   case ISD::UDIV:
   case ISD::SREM:
Index: test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
+++ test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll
@@ -2,10 +2,11 @@
 define <16 x i8> @div16xi8(<16 x i8> %x) {
 ; CHECK-LABEL: div16xi8:
-; CHECK: movi [[DIVISOR:(v[0-9]+)]].16b, #41
-; CHECK-NEXT: smull2 [[SMULL2:(v[0-9]+)]].8h, v0.16b, [[DIVISOR]].16b
-; CHECK-NEXT: smull [[SMULL:(v[0-9]+)]].8h, v0.8b, [[DIVISOR]].8b
-; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).16b]], [[SMULL]].16b, [[SMULL2]].16b
+; CHECK: movi [[DIVISOR:(v[0-9]+)]].8b, #41
+; CHECK-NEXT: ext [[EXT:(v[0-9]+)]].16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: smull [[SMULL1:(v[0-9]+)]].8h, v0.8b, [[DIVISOR]].8b
+; CHECK-NEXT: smull [[SMULL2:(v[0-9]+)]].8h, [[EXT]].8b, [[DIVISOR]].8b
+; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).16b]], [[SMULL1]].16b, [[SMULL2]].16b
 ; CHECK-NEXT: sshr [[SSHR:(v[0-9]+.16b)]], [[UZP2]], #2
 ; CHECK-NEXT: usra v0.16b, [[SSHR]], #7
 %div = sdiv <16 x i8> %x,
@@ -15,10 +16,11 @@
 define <8 x i16> @div8xi16(<8 x i16> %x) {
 ; CHECK-LABEL: div8xi16:
 ; CHECK: mov [[TMP:(w[0-9]+)]], #40815
-; CHECK-NEXT: dup [[DIVISOR:(v[0-9]+)]].8h, [[TMP]]
-; CHECK-NEXT: smull2 [[SMULL2:(v[0-9]+)]].4s, v0.8h, [[DIVISOR]].8h
-; CHECK-NEXT: smull [[SMULL:(v[0-9]+)]].4s, v0.4h, [[DIVISOR]].4h
-; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).8h]], [[SMULL]].8h, [[SMULL2]].8h
+; CHECK-NEXT: ext [[EXT:(v[0-9]+)]].16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: dup [[DIVISOR:(v[0-9]+)]].4h, [[TMP]]
+; CHECK-NEXT: smull [[SMULL1:(v[0-9]+)]].4s, v0.4h, [[DIVISOR]].4h
+; CHECK-NEXT: smull [[SMULL2:(v[0-9]+)]].4s, [[EXT]].4h, [[DIVISOR]].4h
+; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).8h]], [[SMULL1]].8h, [[SMULL2]].8h
 ; CHECK-NEXT: add [[ADD:(v[0-9]+).8h]], [[UZP2]], v0.8h
 ; CHECK-NEXT: sshr [[SSHR:(v[0-9]+).8h]], [[ADD]], #12
 ; CHECK-NEXT: usra v0.8h, [[SSHR]], #15
@@ -30,10 +32,11 @@
 ; CHECK-LABEL: div32xi4:
 ; CHECK: mov [[TMP:(w[0-9]+)]], #7527
 ; CHECK-NEXT: movk [[TMP]], #28805, lsl #16
-; CHECK-NEXT: dup [[DIVISOR:(v[0-9]+)]].4s, [[TMP]]
-; CHECK-NEXT: smull2 [[SMULL2:(v[0-9]+)]].2d, v0.4s, [[DIVISOR]].4s
-; CHECK-NEXT: smull [[SMULL:(v[0-9]+)]].2d, v0.2s, [[DIVISOR]].2s
-; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).4s]], [[SMULL]].4s, [[SMULL2]].4s
+; CHECK-NEXT: ext [[EXT:(v[0-9]+)]].16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: dup [[DIVISOR:(v[0-9]+)]].2s, [[TMP]]
+; CHECK-NEXT: smull [[SMULL1:(v[0-9]+)]].2d, v0.2s, [[DIVISOR]].2s
+; CHECK-NEXT: smull [[SMULL2:(v[0-9]+)]].2d, [[EXT]].2s, [[DIVISOR]].2s
+; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).4s]], [[SMULL1]].4s, [[SMULL2]].4s
 ; CHECK-NEXT: sshr [[SSHR:(v[0-9]+.4s)]], [[UZP2]], #22
 ; CHECK-NEXT: usra v0.4s, [[UZP2]], #31
 %div = sdiv <4 x i32> %x,
@@ -42,10 +45,11 @@
 define <16 x i8> @udiv16xi8(<16 x i8> %x) {
 ; CHECK-LABEL: udiv16xi8:
-; CHECK: movi [[DIVISOR:(v[0-9]+)]].16b, #121
-; CHECK-NEXT: umull2 [[UMULL2:(v[0-9]+)]].8h, v0.16b, [[DIVISOR]].16b
-; CHECK-NEXT: umull [[UMULL:(v[0-9]+)]].8h, v0.8b, [[DIVISOR]].8b
-; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).16b]], [[UMULL]].16b, [[UMULL2]].16b
+; CHECK: movi [[DIVISOR:(v[0-9]+)]].8b, #121
+; CHECK-NEXT: ext [[EXT:(v[0-9]+)]].16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: umull [[UMULL1:(v[0-9]+)]].8h, v0.8b, [[DIVISOR]].8b
+; CHECK-NEXT: umull [[UMULL2:(v[0-9]+)]].8h, [[EXT]].8b, [[DIVISOR]].8b
+; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).16b]], [[UMULL1]].16b, [[UMULL2]].16b
 ; CHECK-NEXT: ushr v0.16b, [[UZP2]], #5
 %div = udiv <16 x i8> %x,
 ret <16 x i8> %div
@@ -54,10 +58,11 @@
 define <8 x i16> @udiv8xi16(<8 x i16> %x) {
 ; CHECK-LABEL: udiv8xi16:
 ; CHECK: mov [[TMP:(w[0-9]+)]], #16593
-; CHECK-NEXT: dup [[DIVISOR:(v[0-9]+)]].8h, [[TMP]]
-; CHECK-NEXT: umull2 [[UMULL2:(v[0-9]+)]].4s, v0.8h, [[DIVISOR]].8h
-; CHECK-NEXT: umull [[UMULL:(v[0-9]+)]].4s, v0.4h, [[DIVISOR]].4h
-; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).8h]], [[UMULL]].8h, [[SMULL2]].8h
+; CHECK-NEXT: ext [[EXT:(v[0-9]+)]].16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: dup [[DIVISOR:(v[0-9]+)]].4h, [[TMP]]
+; CHECK-NEXT: umull [[UMULL1:(v[0-9]+)]].4s, v0.4h, [[DIVISOR]].4h
+; CHECK-NEXT: umull [[UMULL2:(v[0-9]+)]].4s, [[EXT]].4h, [[DIVISOR]].4h
+; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).8h]], [[UMULL1]].8h, [[UMULL2]].8h
 ; CHECK-NEXT: sub [[SUB:(v[0-9]+).8h]], v0.8h, [[UZP2]]
 ; CHECK-NEXT: usra [[USRA:(v[0-9]+).8h]], [[SUB]], #1
 ; CHECK-NEXT: ushr v0.8h, [[USRA]], #12
@@ -69,10 +74,11 @@
 ; CHECK-LABEL: udiv32xi4:
 ; CHECK: mov [[TMP:(w[0-9]+)]], #16747
 ; CHECK-NEXT: movk [[TMP]], #31439, lsl #16
-; CHECK-NEXT: dup [[DIVISOR:(v[0-9]+)]].4s, [[TMP]]
-; CHECK-NEXT: umull2 [[UMULL2:(v[0-9]+)]].2d, v0.4s, [[DIVISOR]].4s
-; CHECK-NEXT: umull [[UMULL:(v[0-9]+)]].2d, v0.2s, [[DIVISOR]].2s
-; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).4s]], [[UMULL]].4s, [[SMULL2]].4s
+; CHECK-NEXT: ext [[EXT:(v[0-9]+)]].16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: dup [[DIVISOR:(v[0-9]+)]].2s, [[TMP]]
+; CHECK-NEXT: umull [[UMULL1:(v[0-9]+)]].2d, v0.2s, [[DIVISOR]].2s
+; CHECK-NEXT: umull [[UMULL2:(v[0-9]+)]].2d, [[EXT]].2s, [[DIVISOR]].2s
+; CHECK-NEXT: uzp2 [[UZP2:(v[0-9]+).4s]], [[UMULL1]].4s, [[UMULL2]].4s
 ; CHECK-NEXT: ushr v0.4s, [[UZP2]], #22
 %div = udiv <4 x i32> %x,
 ret <4 x i32> %div
Index: test/CodeGen/AArch64/neon-idiv.ll
===================================================================
--- test/CodeGen/AArch64/neon-idiv.ll
+++ test/CodeGen/AArch64/neon-idiv.ll
@@ -7,9 +7,11 @@
 ; to MULHS due the simplification by multiplying by a magic number
 ; (TargetLowering::BuildSDIV).
 ; CHECK-LABEL: test1:
-; CHECK: smull2 [[SMULL2:(v[0-9]+)]].2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK: smull [[SMULL:(v[0-9]+)]].2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK: uzp2 [[UZP2:(v[0-9]+).4s]], [[SMULL]].4s, [[SMULL2]].4s
+; CHECK: ext [[EXT:(v[0-9]+)]].16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
+; CHECK: dup [[DUP:(v[0-9]+)]].2s, w8
+; CHECK: smull [[SMULL1:(v[0-9]+)]].2d, {{v[0-9]+}}.2s, [[DUP]].2s
+; CHECK: smull [[SMULL2:(v[0-9]+)]].2d, [[EXT]].2s, [[DUP]].2s
+; CHECK: uzp2 [[UZP2:(v[0-9]+).4s]], [[SMULL1]].4s, [[SMULL2]].4s
 ; CHECK: add [[ADD:(v[0-9]+.4s)]], [[UZP2]], v0.4s
 ; CHECK: sshr [[SSHR:(v[0-9]+.4s)]], [[ADD]], #2
 }
Index: test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
===================================================================
--- test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
+++ test/CodeGen/AArch64/urem-seteq-vec-nonsplat.ll
@@ -6,25 +6,30 @@
 define <4 x i32> @test_urem_odd_div(<4 x i32> %X) nounwind readnone {
 ; CHECK-LABEL: test_urem_odd_div:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI0_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
 ; CHECK-NEXT: adrp x8, .LCPI0_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_1]
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr d4, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: adrp x8, .LCPI0_2
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: sub v3.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v3.4s, v2.4s
-; CHECK-NEXT: umull v2.2d, v3.2s, v2.2s
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2]
+; CHECK-NEXT: umull v1.2d, v2.2s, v1.2s
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2]
+; CHECK-NEXT: umull v4.2d, v0.2s, v4.2s
+; CHECK-NEXT: uzp2 v1.4s, v4.4s, v1.4s
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: sub v4.4s, v0.4s, v1.4s
 ; CHECK-NEXT: adrp x8, .LCPI0_3
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v4.4s
+; CHECK-NEXT: umull v3.2d, v4.2s, v3.2s
+; CHECK-NEXT: ext v4.16b, v4.16b, v4.16b, #8
+; CHECK-NEXT: umull v2.2d, v4.2s, v2.2s
 ; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI0_3]
-; CHECK-NEXT: neg v3.4s, v3.4s
+; CHECK-NEXT: adrp x8, .LCPI0_4
+; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_4]
+; CHECK-NEXT: neg v4.4s, v4.4s
 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v4.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -41,19 +46,22 @@
 ; CHECK-NEXT: adrp x8, .LCPI1_0
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
 ; CHECK-NEXT: adrp x8, .LCPI1_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_1]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI1_1]
 ; CHECK-NEXT: adrp x8, .LCPI1_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI1_2]
 ; CHECK-NEXT: adrp x8, .LCPI1_3
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI1_3]
 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI1_3]
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: adrp x8, .LCPI1_4
+; CHECK-NEXT: umull v2.2d, v1.2s, v2.2s
+; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: umull v1.2d, v1.2s, v3.2s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI1_4]
+; CHECK-NEXT: uzp2 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: neg v2.4s, v4.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -68,15 +76,18 @@
 ; CHECK-LABEL: test_urem_pow2:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI2_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0]
 ; CHECK-NEXT: adrp x8, .LCPI2_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_1]
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI2_1]
 ; CHECK-NEXT: adrp x8, .LCPI2_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_2]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI2_2]
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: adrp x8, .LCPI2_3
+; CHECK-NEXT: umull v2.2d, v2.2s, v3.2s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI2_3]
 ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: neg v2.4s, v2.4s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: neg v2.4s, v4.4s
 ; CHECK-NEXT: ushl v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
@@ -95,22 +106,25 @@
 ; CHECK-NEXT: adrp x8, .LCPI3_0
 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
 ; CHECK-NEXT: adrp x8, .LCPI3_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_1]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI3_1]
 ; CHECK-NEXT: adrp x8, .LCPI3_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_2]
-; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI3_2]
 ; CHECK-NEXT: adrp x8, .LCPI3_3
+; CHECK-NEXT: neg v1.4s, v1.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_3]
 ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v2.4s
-; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_3]
 ; CHECK-NEXT: adrp x8, .LCPI3_4
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_4]
-; CHECK-NEXT: neg v3.4s, v3.4s
-; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s
-; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
-; CHECK-NEXT: mls v0.4s, v2.4s, v4.4s
+; CHECK-NEXT: umull v2.2d, v1.2s, v2.2s
+; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: umull v1.2d, v1.2s, v3.2s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI3_4]
+; CHECK-NEXT: adrp x8, .LCPI3_5
+; CHECK-NEXT: uzp2 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI3_5]
+; CHECK-NEXT: neg v4.4s, v4.4s
+; CHECK-NEXT: ushl v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: bsl v3.16b, v0.16b, v1.16b
+; CHECK-NEXT: mls v0.4s, v3.4s, v2.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -126,15 +140,16 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #52429
 ; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: adrp x9, .LCPI4_0
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI4_0]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v2.4s
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: dup v2.2s, w8
+; CHECK-NEXT: adrp x8, .LCPI4_0
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI4_0]
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
 ; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v4.4s
-; CHECK-NEXT: movi v1.4s, #5
-; CHECK-NEXT: ushr v2.4s, v2.4s, #2
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: uzp2 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: movi v2.4s, #5
+; CHECK-NEXT: ushr v1.4s, v1.4s, #2
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, v3.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -149,17 +164,18 @@
 ; CHECK-LABEL: test_urem_both:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: adrp x8, .LCPI5_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI5_0]
+; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI5_0]
 ; CHECK-NEXT: adrp x8, .LCPI5_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI5_1]
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_1]
 ; CHECK-NEXT: adrp x8, .LCPI5_2
-; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI5_2]
-; CHECK-NEXT: umull2 v4.2d, v0.4s, v1.4s
-; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI5_2]
+; CHECK-NEXT: umull v1.2d, v1.2s, v2.2s
+; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
+; CHECK-NEXT: uzp2 v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT: ushr v1.4s, v1.4s, #2
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
-; CHECK-NEXT: cmeq v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
+; CHECK-NEXT: cmeq v0.4s, v0.4s, v4.4s
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: ret
@@ -187,13 +203,14 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #52429
 ; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: movi v1.4s, #5
-; CHECK-NEXT: ushr v2.4s, v2.4s, #2
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: dup v3.2s, w8
+; CHECK-NEXT: umull v4.2d, v0.2s, v3.2s
+; CHECK-NEXT: umull v1.2d, v1.2s, v3.2s
+; CHECK-NEXT: uzp2 v1.4s, v4.4s, v1.4s
+; CHECK-NEXT: movi v2.4s, #5
+; CHECK-NEXT: ushr v1.4s, v1.4s, #2
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -220,15 +237,20 @@
 define <4 x i32> @test_urem_div_even_odd(<4 x i32> %X) nounwind readnone {
 ; CHECK-LABEL: test_urem_div_even_odd:
 ; CHECK: // %bb.0:
+; CHECK-NEXT: mov w8, #52429
+; CHECK-NEXT: movk w8, #52428, lsl #16
+; CHECK-NEXT: dup v1.2s, w8
+; CHECK-NEXT: mov w8, #43691
+; CHECK-NEXT: movk w8, #43690, lsl #16
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: dup v3.2s, w8
 ; CHECK-NEXT: adrp x8, .LCPI9_0
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0]
-; CHECK-NEXT: adrp x8, .LCPI9_1
-; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI9_1]
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v1.4s
+; CHECK-NEXT: umull v2.2d, v2.2s, v3.2s
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI9_0]
 ; CHECK-NEXT: umull v1.2d, v0.2s, v1.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT: ushr v1.4s, v1.4s, #2
-; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
+; CHECK-NEXT: mls v0.4s, v1.4s, v3.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
Index: test/CodeGen/AArch64/urem-seteq-vec-splat.ll
===================================================================
--- test/CodeGen/AArch64/urem-seteq-vec-splat.ll
+++ test/CodeGen/AArch64/urem-seteq-vec-splat.ll
@@ -8,13 +8,14 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov w8, #52429
 ; CHECK-NEXT: movk w8, #52428, lsl #16
-; CHECK-NEXT: dup v2.4s, w8
-; CHECK-NEXT: umull2 v3.2d, v0.4s, v2.4s
-; CHECK-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s
-; CHECK-NEXT: movi v1.4s, #5
-; CHECK-NEXT: ushr v2.4s, v2.4s, #2
-; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: dup v3.2s, w8
+; CHECK-NEXT: umull v4.2d, v0.2s, v3.2s
+; CHECK-NEXT: umull v1.2d, v1.2s, v3.2s
+; CHECK-NEXT: uzp2 v1.4s, v4.4s, v1.4s
+; CHECK-NEXT: movi v2.4s, #5
+; CHECK-NEXT: ushr v1.4s, v1.4s, #2
+; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
 ; CHECK-NEXT: movi v1.4s, #1
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
@@ -75,10 +76,11 @@
 ; CHECK-NEXT: mov w8, #9363
 ; CHECK-NEXT: movk w8, #37449, lsl #16
 ; CHECK-NEXT: ushr v1.4s, v0.4s, #1
-; CHECK-NEXT: dup v3.4s, w8
-; CHECK-NEXT: umull2 v4.2d, v1.4s, v3.4s
+; CHECK-NEXT: dup v3.2s, w8
+; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
 ; CHECK-NEXT: umull v1.2d, v1.2s, v3.2s
-; CHECK-NEXT: uzp2 v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: umull v3.2d, v4.2s, v3.2s
+; CHECK-NEXT: uzp2 v1.4s, v1.4s, v3.4s
 ; CHECK-NEXT: movi v2.4s, #14
 ; CHECK-NEXT: ushr v1.4s, v1.4s, #2
 ; CHECK-NEXT: mls v0.4s, v1.4s, v2.4s
Index: test/CodeGen/X86/combine-udiv.ll
===================================================================
--- test/CodeGen/X86/combine-udiv.ll
+++ test/CodeGen/X86/combine-udiv.ll
@@ -647,18 +647,17 @@
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: movl $255, %eax
-; SSE2-NEXT: movd %eax, %xmm1
 ; SSE2-NEXT: movl $171, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pmullw %xmm2, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm3
-; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pmullw %xmm2, %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psrlw $8, %xmm1
+; SSE2-NEXT: movl $255, %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: pand %xmm1, %xmm2
 ; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: combine_vec_udiv_nonuniform4:
@@ -670,14 +669,12 @@
 ; SSE41-NEXT: pmullw %xmm0, %xmm2
 ; SSE41-NEXT: psrlw $8, %xmm2
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: packuswb %xmm3, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: psllw $1, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; SSE41-NEXT: psllw $1, %xmm0
+; SSE41-NEXT: psllw $8, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
 ; SSE41-NEXT: psrlw $8, %xmm2
-; SSE41-NEXT: packuswb %xmm3, %xmm2
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm2
 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; SSE41-NEXT: pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT: movdqa %xmm2, %xmm0
@@ -690,14 +687,12 @@
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $8, %xmm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
 ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT: retq
@@ -726,13 +721,9 @@
 ; XOP-NEXT: movl $171, %eax
 ; XOP-NEXT: vmovd %eax, %xmm1
 ; XOP-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm2
-; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; XOP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; XOP-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; XOP-NEXT: vpmullw %xmm1, %xmm3, %xmm1
-; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm2[1,3,5,7,9,11,13,15],xmm1[1,3,5,7,9,11,13,15]
+; XOP-NEXT: vpmullw %xmm1, %xmm2, %xmm1
+; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
 ; XOP-NEXT: movl $249, %eax
 ; XOP-NEXT: vmovd %eax, %xmm2
 ; XOP-NEXT: vpshlb %xmm2, %xmm1, %xmm1
Index: test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
===================================================================
--- test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -629,12 +629,12 @@
 ;
 ; CHECK-AVX1-LABEL: test_urem_both:
 ; CHECK-AVX1: # %bb.0:
-; CHECK-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-9.255967385052751E+61,-9.255967385052751E+61]
-; CHECK-AVX1-NEXT: # xmm1 = mem[0,0]
+; CHECK-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2863311531,3435973837,2863311531,3435973837]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; CHECK-AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
 ; CHECK-AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
 ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
 ; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1
 ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
@@ -645,13 +645,14 @@
 ;
 ; CHECK-AVX2-LABEL: test_urem_both:
 ; CHECK-AVX2: # %bb.0:
-; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14757395262689946283,14757395262689946283]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-AVX2-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD
+; CHECK-AVX2-NEXT: vmovd %eax, %xmm2
+; CHECK-AVX2-NEXT: vpbroadcastd %xmm2, %xmm2
+; CHECK-AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
+; CHECK-AVX2-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
 ; CHECK-AVX2-NEXT: vpsrld $2, %xmm1, %xmm1
 ; CHECK-AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
@@ -661,13 +662,13 @@
 ;
 ; CHECK-AVX512VL-LABEL: test_urem_both:
 ; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14757395262689946283,14757395262689946283]
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1
-; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; CHECK-AVX512VL-NEXT: movl $-858993459, %eax # imm = 0xCCCCCCCD
+; CHECK-AVX512VL-NEXT: vpbroadcastd %eax, %xmm1
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpmuludq %xmm1, %xmm2, %xmm1
+; CHECK-AVX512VL-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm2
+; CHECK-AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
 ; CHECK-AVX512VL-NEXT: vpsrld $2, %xmm1, %xmm1
 ; CHECK-AVX512VL-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
 ; CHECK-AVX512VL-NEXT: vpsubd %xmm1, %xmm0, %xmm0
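
Note for reviewers (not part of the patch itself): all of the CHECK churn above traces back to one expansion. BuildUDIV/BuildSDIV turn a vector divide by a constant into a multiply by a magic number, keep only the high half of the widening product (the MULHU/MULHS node that this change lets the vector legalizer handle), and finish with a shift. Below is a minimal scalar C++ sketch of the unsigned case, keyed to the splat divide-by-5 tests above; the helper name is illustrative, and the constant 0xCCCCCCCD is the value those tests materialize with "mov w8, #52429" + "movk w8, #52428, lsl #16".

#include <cassert>
#include <cstdint>

// Scalar model of the udiv-by-5 expansion the tests check: multiply by
// the magic number 0xCCCCCCCD (= ceil(2^34 / 5)), keep the high 32 bits
// of the 64-bit product (the MULHU step), then shift right by 2.
static uint32_t udiv5_via_mulhu(uint32_t X) {
  uint64_t Prod = (uint64_t)X * 0xCCCCCCCDu; // full 32x32->64 multiply
  uint32_t Hi = (uint32_t)(Prod >> 32);      // MULHU: high half only
  return Hi >> 2;                            // post-shift from BuildUDIV
}

int main() {
  const uint32_t Tests[] = {0u, 1u, 4u, 5u, 6u, 12345u, 0xFFFFFFFFu};
  for (uint32_t X : Tests)
    assert(udiv5_via_mulhu(X) == X / 5); // agrees with plain division
  return 0;
}

NEON has no vector "high half of a multiply" instruction, so the legalized MULHU/MULHS surfaces as two half-width widening multiplies (the upper half reached through ext) whose high halves are gathered with uzp2; the old checks expected the same values from an umull/umull2 pair, which is the pattern change repeated throughout the updated tests.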