diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6016,6 +6016,23 @@ return SDValue(); } + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Try to use leading zeros of the dividend to reduce the multiplier and + // avoid expensive fixups. + // TODO: Support vectors. + unsigned LeadingZeros = 0; + if (!VT.isVector() && isa<ConstantSDNode>(N1)) { + assert(!isOneConstant(N1) && "Unexpected divisor"); + LeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros(); + // UnsignedDivisionByConstantInfo doesn't work correctly if leading zeros in + // the dividend exceed the leading zeros for the divisor. + LeadingZeros = + std::min(LeadingZeros, + cast<ConstantSDNode>(N1)->getAPIntValue().countLeadingZeros()); + } + bool UseNPQ = false; SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors; @@ -6026,7 +6043,7 @@ // bits are known to be zero. const APInt& Divisor = C->getAPIntValue(); UnsignedDivisionByConstantInfo magics = - UnsignedDivisionByConstantInfo::get(Divisor); + UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros); unsigned PreShift = 0, PostShift = 0; // If the divisor is even, we can avoid using the expensive fixup by @@ -6034,8 +6051,8 @@ if (magics.IsAdd && !Divisor[0]) { PreShift = Divisor.countTrailingZeros(); // Get magic number for the shifted divisor. - magics = - UnsignedDivisionByConstantInfo::get(Divisor.lshr(PreShift), PreShift); + magics = UnsignedDivisionByConstantInfo::get(Divisor.lshr(PreShift), + PreShift + LeadingZeros); assert(!magics.IsAdd && "Should use cheap fixup now"); } @@ -6061,9 +6078,6 @@ return true; }; - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - // Collect the shifts/magic values from each element.
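Aside (not part of the patch): the LeadingZeros clamp above works because the round-up magic for an N-bit udiv by constant d is m = ceil(2^N / d) with error e = m*d - 2^N, and q = mulhu(x, m) equals x/d whenever e*x < 2^N. Known leading zeros bound x, so a magic that is only valid for the reduced range fits in N bits and the expensive IsAdd (sub/shr/add) fixup disappears. Below is a minimal self-contained sketch of that bound for d = 95 with 16 known leading zeros, matching the i16 lanes (zero-extended to i32) in the urem-vector-lkk tests further down; the constants are derived locally here, not taken from UnsignedDivisionByConstantInfo.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t D = 95;
  // Round-up magic: ceil(2^32 / 95) = 0x02B1DA47 -- the "mov w9, #55879;
  // movk w9, #689, lsl #16" pair in the regenerated AArch64 checks.
  const uint64_t M = ((uint64_t(1) << 32) + D - 1) / D;
  assert(M == 0x02B1DA47);
  // Error term e = M*D - 2^32 = 89, and e*x < 2^32 for every 16-bit x, so
  // the high half of the 32x32 multiply is already the exact quotient:
  // no pre-shift, no post-shift, no fixup.
  for (uint64_t X = 0; X <= 0xFFFF; ++X)
    assert(((M * X) >> 32) == X / D);
  return 0;
}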
if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern)) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -69,15 +69,14 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-LABEL: fshl_i37: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #31883 +; CHECK-NEXT: mov x9, #46053 ; CHECK-NEXT: and x8, x2, #0x1fffffffff -; CHECK-NEXT: movk x9, #3542, lsl #16 +; CHECK-NEXT: movk x9, #12398, lsl #16 ; CHECK-NEXT: ubfiz x10, x1, #26, #37 -; CHECK-NEXT: movk x9, #51366, lsl #32 -; CHECK-NEXT: movk x9, #56679, lsl #48 +; CHECK-NEXT: movk x9, #15941, lsl #32 +; CHECK-NEXT: movk x9, #1771, lsl #48 ; CHECK-NEXT: umulh x8, x8, x9 ; CHECK-NEXT: mov w9, #37 -; CHECK-NEXT: ubfx x8, x8, #5, #27 ; CHECK-NEXT: msub w8, w8, w9, w2 ; CHECK-NEXT: mvn w9, w8 ; CHECK-NEXT: lsl x8, x0, x8 @@ -207,16 +206,15 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) { ; CHECK-LABEL: fshr_i37: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x9, #31883 +; CHECK-NEXT: mov x9, #46053 ; CHECK-NEXT: and x8, x2, #0x1fffffffff -; CHECK-NEXT: movk x9, #3542, lsl #16 +; CHECK-NEXT: movk x9, #12398, lsl #16 ; CHECK-NEXT: lsl x10, x1, #27 -; CHECK-NEXT: movk x9, #51366, lsl #32 +; CHECK-NEXT: movk x9, #15941, lsl #32 ; CHECK-NEXT: lsl x11, x0, #1 -; CHECK-NEXT: movk x9, #56679, lsl #48 +; CHECK-NEXT: movk x9, #1771, lsl #48 ; CHECK-NEXT: umulh x8, x8, x9 ; CHECK-NEXT: mov w9, #37 -; CHECK-NEXT: lsr x8, x8, #5 ; CHECK-NEXT: msub w8, w8, w9, w2 ; CHECK-NEXT: add w8, w8, #27 ; CHECK-NEXT: mvn w9, w8 diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll --- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -6,40 +6,36 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: mov w9, #8969 -; CHECK-NEXT: movk w9, #22765, lsl #16 +; CHECK-NEXT: mov w9, #55879 +; CHECK-NEXT: movk w9, #689, lsl #16 ; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: mov w12, #16913 -; CHECK-NEXT: mov w13, #95 -; CHECK-NEXT: movk w12, #8456, lsl #16 +; CHECK-NEXT: mov w11, #33826 +; CHECK-NEXT: mov w12, #95 +; CHECK-NEXT: movk w11, #528, lsl #16 +; CHECK-NEXT: umov w13, v0.h[2] ; CHECK-NEXT: umull x9, w8, w9 -; CHECK-NEXT: ubfx w14, w10, #2, #14 +; CHECK-NEXT: umull x11, w10, w11 ; CHECK-NEXT: lsr x9, x9, #32 -; CHECK-NEXT: sub w11, w8, w9 -; CHECK-NEXT: umull x12, w14, w12 -; CHECK-NEXT: add w9, w9, w11, lsr #1 -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: lsr w9, w9, #6 -; CHECK-NEXT: lsr x12, x12, #34 -; CHECK-NEXT: msub w8, w9, w13, w8 -; CHECK-NEXT: mov w9, #33437 -; CHECK-NEXT: movk w9, #21399, lsl #16 -; CHECK-NEXT: mov w13, #124 -; CHECK-NEXT: umull x9, w11, w9 -; CHECK-NEXT: msub w10, w12, w13, w10 -; CHECK-NEXT: umov w12, v0.h[3] +; CHECK-NEXT: lsr x11, x11, #32 +; CHECK-NEXT: msub w8, w9, w12, w8 +; CHECK-NEXT: mov w9, #48149 +; CHECK-NEXT: movk w9, #668, lsl #16 +; CHECK-NEXT: mov w12, #124 +; CHECK-NEXT: umull x9, w13, w9 +; CHECK-NEXT: msub w10, w11, w12, w10 +; CHECK-NEXT: umov w11, v0.h[3] ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov w13, #2287 -; CHECK-NEXT: lsr x8, x9, #37 +; CHECK-NEXT: mov w12, #22281 +; CHECK-NEXT: lsr x8, x9, #32 ; CHECK-NEXT: mov w9, #98 -; CHECK-NEXT: movk w13, #16727, lsl #16 -; CHECK-NEXT: msub w8, w8, w9, w11 +; CHECK-NEXT: movk w12, #65, lsl #16 +; CHECK-NEXT: msub w8, w8, w9, w13 ; CHECK-NEXT: mov v0.h[1], w10 -; 
CHECK-NEXT: umull x9, w12, w13 +; CHECK-NEXT: umull x9, w11, w12 ; CHECK-NEXT: mov w10, #1003 -; CHECK-NEXT: lsr x9, x9, #40 +; CHECK-NEXT: lsr x9, x9, #32 ; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: msub w8, w9, w10, w12 +; CHECK-NEXT: msub w8, w9, w10, w11 ; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret @@ -51,40 +47,28 @@ ; CHECK-LABEL: fold_urem_vec_2: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: mov w8, #8969 -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: movk w8, #22765, lsl #16 -; CHECK-NEXT: umov w15, v0.h[2] -; CHECK-NEXT: umov w16, v0.h[3] -; CHECK-NEXT: umull x12, w10, w8 -; CHECK-NEXT: umull x11, w9, w8 -; CHECK-NEXT: lsr x12, x12, #32 +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: mov w9, #55879 +; CHECK-NEXT: movk w9, #689, lsl #16 +; CHECK-NEXT: umov w10, v0.h[1] +; CHECK-NEXT: mov w12, #95 +; CHECK-NEXT: umov w13, v0.h[2] +; CHECK-NEXT: umull x11, w8, w9 +; CHECK-NEXT: umull x14, w10, w9 ; CHECK-NEXT: lsr x11, x11, #32 -; CHECK-NEXT: sub w14, w10, w12 -; CHECK-NEXT: sub w13, w9, w11 -; CHECK-NEXT: add w12, w12, w14, lsr #1 -; CHECK-NEXT: umull x14, w15, w8 -; CHECK-NEXT: add w11, w11, w13, lsr #1 -; CHECK-NEXT: mov w13, #95 -; CHECK-NEXT: lsr w12, w12, #6 -; CHECK-NEXT: lsr w11, w11, #6 -; CHECK-NEXT: umull x8, w16, w8 -; CHECK-NEXT: msub w10, w12, w13, w10 -; CHECK-NEXT: lsr x12, x14, #32 -; CHECK-NEXT: msub w9, w11, w13, w9 -; CHECK-NEXT: sub w11, w15, w12 -; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: add w10, w12, w11, lsr #1 -; CHECK-NEXT: lsr w10, w10, #6 -; CHECK-NEXT: sub w11, w16, w8 -; CHECK-NEXT: mov v0.h[1], w9 -; CHECK-NEXT: msub w9, w10, w13, w15 -; CHECK-NEXT: add w8, w8, w11, lsr #1 -; CHECK-NEXT: lsr w8, w8, #6 -; CHECK-NEXT: mov v0.h[2], w9 -; CHECK-NEXT: msub w8, w8, w13, w16 +; CHECK-NEXT: msub w8, w11, w12, w8 +; CHECK-NEXT: lsr x11, x14, #32 +; CHECK-NEXT: umull x14, w13, w9 +; CHECK-NEXT: msub w10, w11, w12, w10 +; CHECK-NEXT: umov w11, v0.h[3] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: lsr x8, x14, #32 +; CHECK-NEXT: msub w8, w8, w12, w13 +; CHECK-NEXT: mov v0.h[1], w10 +; CHECK-NEXT: umull x9, w11, w9 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: mov v0.h[2], w8 +; CHECK-NEXT: msub w8, w9, w12, w11 ; CHECK-NEXT: mov v0.h[3], w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret @@ -98,45 +82,33 @@ ; CHECK-LABEL: combine_urem_udiv: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: mov w8, #8969 -; CHECK-NEXT: movk w8, #22765, lsl #16 +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: mov w9, #55879 +; CHECK-NEXT: movk w9, #689, lsl #16 ; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[2] -; CHECK-NEXT: mov w15, #95 -; CHECK-NEXT: umov w13, v0.h[3] -; CHECK-NEXT: umull x12, w9, w8 -; CHECK-NEXT: umull x14, w10, w8 -; CHECK-NEXT: lsr x12, x12, #32 -; CHECK-NEXT: umull x17, w11, w8 -; CHECK-NEXT: sub w16, w9, w12 -; CHECK-NEXT: lsr x14, x14, #32 -; CHECK-NEXT: lsr x17, x17, #32 -; CHECK-NEXT: umull x8, w13, w8 -; CHECK-NEXT: add w12, w12, w16, lsr #1 -; CHECK-NEXT: sub w16, w10, w14 -; CHECK-NEXT: lsr w12, w12, #6 +; CHECK-NEXT: mov w12, #95 +; CHECK-NEXT: umov w14, v0.h[2] +; CHECK-NEXT: umov w15, v0.h[3] +; CHECK-NEXT: umull x11, w8, w9 +; CHECK-NEXT: umull x13, w10, w9 +; CHECK-NEXT: lsr x11, x11, #32 +; CHECK-NEXT: lsr x13, x13, #32 +; CHECK-NEXT: msub w8, w11, w12, w8 +; CHECK-NEXT: msub w10, w13, w12, w10 
+; CHECK-NEXT: fmov s1, w11 +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: umull x8, w14, w9 +; CHECK-NEXT: umull x9, w15, w9 ; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: add w14, w14, w16, lsr #1 -; CHECK-NEXT: sub w16, w11, w17 -; CHECK-NEXT: msub w9, w12, w15, w9 -; CHECK-NEXT: lsr w14, w14, #6 -; CHECK-NEXT: add w16, w17, w16, lsr #1 -; CHECK-NEXT: fmov s1, w12 -; CHECK-NEXT: msub w10, w14, w15, w10 -; CHECK-NEXT: sub w17, w13, w8 -; CHECK-NEXT: fmov s0, w9 -; CHECK-NEXT: lsr w9, w16, #6 -; CHECK-NEXT: mov v1.h[1], w14 -; CHECK-NEXT: add w8, w8, w17, lsr #1 -; CHECK-NEXT: msub w11, w9, w15, w11 -; CHECK-NEXT: lsr w8, w8, #6 ; CHECK-NEXT: mov v0.h[1], w10 -; CHECK-NEXT: msub w10, w8, w15, w13 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: mov v0.h[2], w11 -; CHECK-NEXT: mov v1.h[3], w8 -; CHECK-NEXT: mov v0.h[3], w10 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: msub w10, w8, w12, w14 +; CHECK-NEXT: mov v1.h[1], w13 +; CHECK-NEXT: msub w11, w9, w12, w15 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: mov v0.h[3], w11 +; CHECK-NEXT: mov v1.h[3], w9 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95> @@ -151,25 +123,22 @@ ; CHECK-LABEL: dont_fold_urem_power_of_two: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w10, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[3] -; CHECK-NEXT: mov w8, #8969 +; CHECK-NEXT: umov w9, v0.h[0] ; CHECK-NEXT: umov w11, v0.h[1] -; CHECK-NEXT: movk w8, #22765, lsl #16 -; CHECK-NEXT: and w10, w10, #0x3f -; CHECK-NEXT: umull x8, w9, w8 -; CHECK-NEXT: and w11, w11, #0x1f +; CHECK-NEXT: umov w10, v0.h[3] +; CHECK-NEXT: mov w8, #55879 +; CHECK-NEXT: movk w8, #689, lsl #16 +; CHECK-NEXT: and w9, w9, #0x3f +; CHECK-NEXT: umull x8, w10, w8 +; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: and w9, w11, #0x1f +; CHECK-NEXT: umov w11, v0.h[2] ; CHECK-NEXT: lsr x8, x8, #32 -; CHECK-NEXT: fmov s1, w10 -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: sub w12, w9, w8 -; CHECK-NEXT: mov v1.h[1], w11 -; CHECK-NEXT: add w8, w8, w12, lsr #1 -; CHECK-NEXT: and w10, w10, #0x7 -; CHECK-NEXT: lsr w8, w8, #6 -; CHECK-NEXT: mov w11, #95 -; CHECK-NEXT: msub w8, w8, w11, w9 -; CHECK-NEXT: mov v1.h[2], w10 +; CHECK-NEXT: mov v1.h[1], w9 +; CHECK-NEXT: mov w9, #95 +; CHECK-NEXT: and w11, w11, #0x7 +; CHECK-NEXT: msub w8, w8, w9, w10 +; CHECK-NEXT: mov v1.h[2], w11 ; CHECK-NEXT: mov v1.h[3], w8 ; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret @@ -182,29 +151,28 @@ ; CHECK-LABEL: dont_fold_srem_one: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: mov w8, #30865 -; CHECK-NEXT: movk w8, #51306, lsl #16 -; CHECK-NEXT: umov w11, v0.h[2] +; CHECK-NEXT: umov w8, v0.h[1] +; CHECK-NEXT: mov w9, #13629 +; CHECK-NEXT: movk w9, #100, lsl #16 +; CHECK-NEXT: umov w10, v0.h[2] +; CHECK-NEXT: mov w11, #25645 ; CHECK-NEXT: mov w12, #654 +; CHECK-NEXT: movk w11, #2849, lsl #16 ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: mov w13, #47143 -; CHECK-NEXT: ubfx w10, w9, #1, #15 -; CHECK-NEXT: movk w13, #24749, lsl #16 -; CHECK-NEXT: umull x8, w10, w8 -; CHECK-NEXT: mov w10, #17097 -; CHECK-NEXT: movk w10, #45590, lsl #16 -; CHECK-NEXT: lsr x8, x8, #40 -; CHECK-NEXT: umull x10, w11, w10 -; CHECK-NEXT: msub w8, w8, w12, w9 +; CHECK-NEXT: umull x9, w8, w9 +; CHECK-NEXT: mov w13, #5560 +; CHECK-NEXT: umull x11, w10, w11 +; CHECK-NEXT: movk w13, #12, lsl #16 +; CHECK-NEXT: lsr x9, x9, #32 +; CHECK-NEXT: lsr x11, x11, #32 +; CHECK-NEXT: msub w8, w9, w12, w8 ; CHECK-NEXT: 
umov w9, v0.h[3] -; CHECK-NEXT: lsr x10, x10, #36 ; CHECK-NEXT: mov w12, #23 -; CHECK-NEXT: msub w10, w10, w12, w11 +; CHECK-NEXT: msub w10, w11, w12, w10 ; CHECK-NEXT: mov w11, #5423 ; CHECK-NEXT: mov v1.h[1], w8 ; CHECK-NEXT: umull x8, w9, w13 -; CHECK-NEXT: lsr x8, x8, #43 +; CHECK-NEXT: lsr x8, x8, #32 ; CHECK-NEXT: mov v1.h[2], w10 ; CHECK-NEXT: msub w8, w8, w11, w9 ; CHECK-NEXT: mov v1.h[3], w8 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -1335,10 +1335,9 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v3, 0xffffff, v2 -; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab +; SI-NEXT: s_mov_b32 s4, 0xaaaaaab ; SI-NEXT: v_mul_hi_u32 v3, v3, s4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v3 ; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2 @@ -1349,10 +1348,9 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v3, 0xffffff, v2 -; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab +; VI-NEXT: s_mov_b32 s4, 0xaaaaaab ; VI-NEXT: v_mul_hi_u32 v3, v3, s4 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v3 ; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2 @@ -1363,10 +1361,9 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 0xffffff, v2 -; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab ; GFX9-NEXT: v_mul_hi_u32 v3, v3, s4 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v3 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 @@ -1384,8 +1381,7 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 ; GFX10-NEXT: v_mul_u32_u24_e32 v3, 24, v3 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 8, v2 @@ -1399,13 +1395,12 @@ ; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_hi_u32 v3, 0xaaaaaaab, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_hi_u32 v3, 0xaaaaaab, v3 ; GFX11-NEXT: v_mul_u32_u24_e32 v3, 24, v3 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX11-NEXT: v_add_nc_u32_e32 v2, 8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_alignbit_b32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2) @@ -1417,19 +1412,17 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; SI-NEXT: s_mov_b32 s4, 0xaaaaaaab +; SI-NEXT: s_mov_b32 s4, 0xaaaaaab ; SI-NEXT: v_mul_hi_u32 v6, v6, s4 ; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; SI-NEXT: 
v_mul_u32_u24_e32 v6, 24, v6 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; SI-NEXT: v_mul_hi_u32 v6, v7, s4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v6 -; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v3 +; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v6 ; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3 @@ -1439,19 +1432,17 @@ ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; VI-NEXT: s_mov_b32 s4, 0xaaaaaaab +; VI-NEXT: s_mov_b32 s4, 0xaaaaaab ; VI-NEXT: v_mul_hi_u32 v6, v6, s4 ; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 ; VI-NEXT: v_mul_hi_u32 v6, v7, s4 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v6 -; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v3 +; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v6 ; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3 ; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3 @@ -1461,19 +1452,17 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaaab +; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab ; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 ; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4 ; GFX9-NEXT: v_add_u32_e32 v4, 8, v4 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v6 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v6 ; GFX9-NEXT: v_sub_u32_e32 v3, v5, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 8, v3 ; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 @@ -1492,10 +1481,8 @@ ; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v7 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7 +; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 +; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 ; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 @@ -1515,11 +1502,8 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v6 -; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 4, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 4, v7 +; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 +; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7 diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll --- 
a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -701,16 +701,15 @@ ; ARMT2-NEXT: sub sp, sp, #4 ; ARMT2-NEXT: ldr r1, [sp] ; ARMT2-NEXT: mov r0, #33 -; ARMT2-NEXT: movw r2, #52429 -; ARMT2-NEXT: movt r2, #52428 +; ARMT2-NEXT: movw r2, #39322 +; ARMT2-NEXT: movt r2, #6553 ; ARMT2-NEXT: bfi r1, r0, #0, #12 ; ARMT2-NEXT: mov r0, #10 ; ARMT2-NEXT: bfi r1, r0, #12, #13 ; ARMT2-NEXT: mov r0, r1 ; ARMT2-NEXT: bfc r0, #12, #20 ; ARMT2-NEXT: umull r2, r3, r0, r2 -; ARMT2-NEXT: lsr r2, r3, #3 -; ARMT2-NEXT: add r2, r2, r2, lsl #2 +; ARMT2-NEXT: add r2, r3, r3, lsl #2 ; ARMT2-NEXT: sub r0, r0, r2, lsl #1 ; ARMT2-NEXT: movw r2, #40960 ; ARMT2-NEXT: movt r2, #65024 @@ -764,16 +763,15 @@ ; THUMB2-NEXT: sub sp, #4 ; THUMB2-NEXT: ldr r1, [sp] ; THUMB2-NEXT: movs r0, #33 -; THUMB2-NEXT: movw r2, #52429 +; THUMB2-NEXT: movw r2, #39322 ; THUMB2-NEXT: bfi r1, r0, #0, #12 ; THUMB2-NEXT: movs r0, #10 ; THUMB2-NEXT: bfi r1, r0, #12, #13 ; THUMB2-NEXT: mov r0, r1 -; THUMB2-NEXT: movt r2, #52428 +; THUMB2-NEXT: movt r2, #6553 ; THUMB2-NEXT: bfc r0, #12, #20 ; THUMB2-NEXT: umull r2, r3, r0, r2 -; THUMB2-NEXT: lsrs r2, r3, #3 -; THUMB2-NEXT: add.w r2, r2, r2, lsl #2 +; THUMB2-NEXT: add.w r2, r3, r3, lsl #2 ; THUMB2-NEXT: sub.w r0, r0, r2, lsl #1 ; THUMB2-NEXT: movw r2, #40960 ; THUMB2-NEXT: movt r2, #65024 diff --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll --- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll +++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll @@ -352,15 +352,14 @@ ; ; CHECK64-LABEL: fshl_i37: ; CHECK64: # %bb.0: -; CHECK64-NEXT: lis 6, 28339 +; CHECK64-NEXT: lis 6, 1771 ; CHECK64-NEXT: clrldi 7, 5, 27 -; CHECK64-NEXT: ori 6, 6, 58451 +; CHECK64-NEXT: ori 6, 6, 15941 ; CHECK64-NEXT: sldi 4, 4, 27 -; CHECK64-NEXT: rldic 6, 6, 33, 0 -; CHECK64-NEXT: oris 6, 6, 3542 -; CHECK64-NEXT: ori 6, 6, 31883 +; CHECK64-NEXT: rldic 6, 6, 32, 5 +; CHECK64-NEXT: oris 6, 6, 12398 +; CHECK64-NEXT: ori 6, 6, 46053 ; CHECK64-NEXT: mulhdu 6, 7, 6 -; CHECK64-NEXT: rldicl 6, 6, 59, 5 ; CHECK64-NEXT: mulli 6, 6, 37 ; CHECK64-NEXT: sub 5, 5, 6 ; CHECK64-NEXT: clrlwi 5, 5, 26 @@ -649,15 +648,14 @@ ; ; CHECK64-LABEL: fshr_i37: ; CHECK64: # %bb.0: -; CHECK64-NEXT: lis 6, 28339 +; CHECK64-NEXT: lis 6, 1771 ; CHECK64-NEXT: clrldi 7, 5, 27 -; CHECK64-NEXT: ori 6, 6, 58451 +; CHECK64-NEXT: ori 6, 6, 15941 ; CHECK64-NEXT: sldi 4, 4, 27 -; CHECK64-NEXT: rldic 6, 6, 33, 0 -; CHECK64-NEXT: oris 6, 6, 3542 -; CHECK64-NEXT: ori 6, 6, 31883 +; CHECK64-NEXT: rldic 6, 6, 32, 5 +; CHECK64-NEXT: oris 6, 6, 12398 +; CHECK64-NEXT: ori 6, 6, 46053 ; CHECK64-NEXT: mulhdu 6, 7, 6 -; CHECK64-NEXT: rldicl 6, 6, 59, 5 ; CHECK64-NEXT: mulli 6, 6, 37 ; CHECK64-NEXT: sub 5, 5, 6 ; CHECK64-NEXT: addi 5, 5, 27 diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll --- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll @@ -11,102 +11,86 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; P9LE-LABEL: fold_urem_vec_1: ; P9LE: # %bb.0: -; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: lis r4, 21399 -; P9LE-NEXT: lis r5, 8456 +; P9LE-NEXT: li r3, 0 +; P9LE-NEXT: lis r4, 689 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 33437 -; P9LE-NEXT: ori r5, r5, 16913 +; P9LE-NEXT: ori r4, r4, 55879 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 5 -; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 
16727 +; P9LE-NEXT: lis r4, 528 ; P9LE-NEXT: mtvsrd v3, r3 -; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 2287 +; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: ori r4, r4, 33826 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 8 -; P9LE-NEXT: mulli r4, r4, 1003 +; P9LE-NEXT: mulli r4, r4, 124 ; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: lis r4, 668 ; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: li r3, 4 +; P9LE-NEXT: ori r4, r4, 48149 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r4, r3, 16 -; P9LE-NEXT: rlwinm r3, r3, 30, 18, 31 -; P9LE-NEXT: mulhwu r3, r3, r5 -; P9LE-NEXT: srwi r3, r3, 2 -; P9LE-NEXT: mulli r3, r3, 124 -; P9LE-NEXT: sub r3, r4, r3 -; P9LE-NEXT: lis r4, 22765 +; P9LE-NEXT: clrlwi r3, r3, 16 +; P9LE-NEXT: mulhwu r4, r3, r4 +; P9LE-NEXT: mulli r4, r4, 98 +; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: lis r4, 65 ; P9LE-NEXT: mtvsrd v4, r3 -; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: ori r4, r4, 8969 +; P9LE-NEXT: li r3, 6 +; P9LE-NEXT: ori r4, r4, 22281 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 -; P9LE-NEXT: mulli r4, r4, 95 +; P9LE-NEXT: mulli r4, r4, 1003 ; P9LE-NEXT: sub r3, r3, r4 ; P9LE-NEXT: mtvsrd v2, r3 -; P9LE-NEXT: vmrghh v2, v4, v2 -; P9LE-NEXT: xxmrglw v2, v3, v2 +; P9LE-NEXT: vmrghh v2, v2, v4 +; P9LE-NEXT: xxmrglw v2, v2, v3 ; P9LE-NEXT: blr ; ; P9BE-LABEL: fold_urem_vec_1: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 16727 -; P9BE-NEXT: lis r5, 8456 +; P9BE-NEXT: lis r4, 65 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 2287 -; P9BE-NEXT: ori r5, r5, 16913 +; P9BE-NEXT: ori r4, r4, 22281 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 8 ; P9BE-NEXT: mulli r4, r4, 1003 ; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, 21399 +; P9BE-NEXT: lis r4, 668 ; P9BE-NEXT: mtfprwz f0, r3 ; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: ori r4, r4, 33437 +; P9BE-NEXT: ori r4, r4, 48149 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 5 ; P9BE-NEXT: mulli r4, r4, 98 ; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: lis r4, 528 ; P9BE-NEXT: mtfprwz f1, r3 ; P9BE-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; P9BE-NEXT: ori r4, r4, 33826 ; P9BE-NEXT: addi r3, r3, .LCPI0_0@toc@l ; P9BE-NEXT: lxv vs2, 0(r3) ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31 +; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: xxperm vs0, vs1, vs2 -; P9BE-NEXT: mulhwu r3, r3, r5 -; P9BE-NEXT: srwi r3, r3, 2 -; P9BE-NEXT: mulli r3, r3, 124 -; P9BE-NEXT: sub r3, r4, r3 -; P9BE-NEXT: lis r4, 22765 +; P9BE-NEXT: mulhwu r4, r3, r4 +; P9BE-NEXT: mulli r4, r4, 124 +; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: lis r4, 689 ; P9BE-NEXT: mtfprwz f1, r3 ; P9BE-NEXT: li r3, 0 -; P9BE-NEXT: ori r4, r4, 8969 +; P9BE-NEXT: ori r4, r4, 55879 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 ; P9BE-NEXT: mulli r4, r4, 95 ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: mtfprwz f3, r3 @@ -117,100 +101,84 @@ ; P8LE-LABEL: fold_urem_vec_1: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; 
P8LE-NEXT: lis r7, 21399 -; P8LE-NEXT: lis r9, 16727 -; P8LE-NEXT: lis r10, 8456 -; P8LE-NEXT: ori r3, r3, 8969 -; P8LE-NEXT: ori r7, r7, 33437 -; P8LE-NEXT: ori r9, r9, 2287 -; P8LE-NEXT: ori r10, r10, 16913 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: lis r8, 528 +; P8LE-NEXT: lis r9, 668 +; P8LE-NEXT: lis r10, 65 +; P8LE-NEXT: ori r3, r3, 55879 +; P8LE-NEXT: ori r8, r8, 33826 +; P8LE-NEXT: ori r9, r9, 48149 +; P8LE-NEXT: ori r10, r10, 22281 ; P8LE-NEXT: mffprd r4, f0 -; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: rldicl r5, r4, 32, 48 -; P8LE-NEXT: clrlwi r6, r6, 16 -; P8LE-NEXT: rldicl r8, r4, 16, 48 +; P8LE-NEXT: clrldi r5, r4, 48 +; P8LE-NEXT: rldicl r6, r4, 48, 48 +; P8LE-NEXT: rldicl r7, r4, 32, 48 +; P8LE-NEXT: rldicl r4, r4, 16, 48 ; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r6, r3 -; P8LE-NEXT: rldicl r4, r4, 48, 48 -; P8LE-NEXT: clrlwi r8, r8, 16 -; P8LE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8LE-NEXT: mulhwu r7, r5, r7 +; P8LE-NEXT: clrlwi r6, r6, 16 +; P8LE-NEXT: mulhwu r3, r5, r3 +; P8LE-NEXT: clrlwi r7, r7, 16 ; P8LE-NEXT: clrlwi r4, r4, 16 -; P8LE-NEXT: mulhwu r9, r8, r9 -; P8LE-NEXT: mulhwu r10, r11, r10 -; P8LE-NEXT: sub r11, r6, r3 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: srwi r7, r7, 5 -; P8LE-NEXT: add r3, r11, r3 -; P8LE-NEXT: srwi r9, r9, 8 -; P8LE-NEXT: srwi r10, r10, 2 -; P8LE-NEXT: srwi r3, r3, 6 -; P8LE-NEXT: mulli r7, r7, 98 -; P8LE-NEXT: mulli r9, r9, 1003 +; P8LE-NEXT: mulhwu r8, r6, r8 +; P8LE-NEXT: mulhwu r9, r7, r9 +; P8LE-NEXT: mulhwu r10, r4, r10 ; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: mulli r10, r10, 124 -; P8LE-NEXT: sub r5, r5, r7 -; P8LE-NEXT: sub r7, r8, r9 -; P8LE-NEXT: sub r3, r6, r3 -; P8LE-NEXT: mtvsrd v2, r5 +; P8LE-NEXT: mulli r8, r8, 124 +; P8LE-NEXT: mulli r9, r9, 98 +; P8LE-NEXT: mulli r10, r10, 1003 +; P8LE-NEXT: sub r3, r5, r3 +; P8LE-NEXT: sub r5, r6, r8 +; P8LE-NEXT: mtvsrd v2, r3 +; P8LE-NEXT: sub r3, r7, r9 ; P8LE-NEXT: sub r4, r4, r10 -; P8LE-NEXT: mtvsrd v3, r7 +; P8LE-NEXT: mtvsrd v3, r5 ; P8LE-NEXT: mtvsrd v4, r3 ; P8LE-NEXT: mtvsrd v5, r4 ; P8LE-NEXT: vmrghh v2, v3, v2 ; P8LE-NEXT: vmrghh v3, v5, v4 -; P8LE-NEXT: xxmrglw v2, v2, v3 +; P8LE-NEXT: xxmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: fold_urem_vec_1: ; P8BE: # %bb.0: ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: lis r7, 16727 -; P8BE-NEXT: lis r9, 21399 -; P8BE-NEXT: lis r10, 8456 -; P8BE-NEXT: ori r3, r3, 8969 -; P8BE-NEXT: ori r7, r7, 2287 -; P8BE-NEXT: ori r9, r9, 33437 -; P8BE-NEXT: ori r10, r10, 16913 -; P8BE-NEXT: rldicl r6, r4, 16, 48 +; P8BE-NEXT: lis r3, 65 +; P8BE-NEXT: lis r8, 668 +; P8BE-NEXT: lis r9, 528 +; P8BE-NEXT: lis r10, 689 +; P8BE-NEXT: ori r3, r3, 22281 +; P8BE-NEXT: ori r8, r8, 48149 +; P8BE-NEXT: ori r9, r9, 33826 +; P8BE-NEXT: ori r10, r10, 55879 ; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: clrlwi r6, r6, 16 +; P8BE-NEXT: rldicl r6, r4, 48, 48 ; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: mulhwu r3, r6, r3 -; P8BE-NEXT: rldicl r8, r4, 48, 48 -; P8BE-NEXT: mulhwu r7, r5, r7 -; P8BE-NEXT: rldicl r4, r4, 32, 48 -; P8BE-NEXT: clrlwi r8, r8, 16 -; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31 -; P8BE-NEXT: mulhwu r9, r8, r9 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: clrlwi r6, r6, 16 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: mulhwu r3, r5, r3 +; P8BE-NEXT: clrlwi r7, r7, 16 ; P8BE-NEXT: clrlwi r4, r4, 16 -; P8BE-NEXT: mulhwu r10, r11, r10 -; P8BE-NEXT: sub r11, r6, r3 -; P8BE-NEXT: srwi r7, r7, 8 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: mulli r7, r7, 1003 -; P8BE-NEXT: srwi 
r9, r9, 5 -; P8BE-NEXT: srwi r3, r3, 6 -; P8BE-NEXT: srwi r10, r10, 2 -; P8BE-NEXT: mulli r9, r9, 98 -; P8BE-NEXT: mulli r3, r3, 95 -; P8BE-NEXT: mulli r10, r10, 124 -; P8BE-NEXT: sub r5, r5, r7 -; P8BE-NEXT: addis r7, r2, .LCPI0_0@toc@ha -; P8BE-NEXT: mtvsrwz v2, r5 -; P8BE-NEXT: addi r5, r7, .LCPI0_0@toc@l -; P8BE-NEXT: sub r8, r8, r9 -; P8BE-NEXT: lxvw4x v3, 0, r5 -; P8BE-NEXT: sub r3, r6, r3 +; P8BE-NEXT: mulhwu r8, r6, r8 +; P8BE-NEXT: mulhwu r9, r7, r9 +; P8BE-NEXT: mulhwu r10, r4, r10 +; P8BE-NEXT: mulli r3, r3, 1003 +; P8BE-NEXT: mulli r8, r8, 98 +; P8BE-NEXT: mulli r9, r9, 124 +; P8BE-NEXT: mulli r10, r10, 95 +; P8BE-NEXT: sub r3, r5, r3 +; P8BE-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; P8BE-NEXT: mtvsrwz v2, r3 +; P8BE-NEXT: addi r3, r5, .LCPI0_0@toc@l +; P8BE-NEXT: sub r6, r6, r8 +; P8BE-NEXT: lxvw4x v3, 0, r3 +; P8BE-NEXT: sub r3, r7, r9 ; P8BE-NEXT: sub r4, r4, r10 -; P8BE-NEXT: mtvsrwz v4, r8 +; P8BE-NEXT: mtvsrwz v4, r6 ; P8BE-NEXT: mtvsrwz v5, r3 ; P8BE-NEXT: mtvsrwz v0, r4 ; P8BE-NEXT: vperm v2, v4, v2, v3 -; P8BE-NEXT: vperm v3, v5, v0, v3 +; P8BE-NEXT: vperm v3, v0, v5, v3 ; P8BE-NEXT: xxmrghw v2, v3, v2 ; P8BE-NEXT: blr %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003> @@ -221,15 +189,11 @@ ; P9LE-LABEL: fold_urem_vec_2: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 +; P9LE-NEXT: lis r4, 689 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 +; P9LE-NEXT: ori r4, r4, 55879 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 ; P9LE-NEXT: mulli r5, r5, 95 ; P9LE-NEXT: sub r3, r3, r5 ; P9LE-NEXT: mtvsrd v3, r3 @@ -237,10 +201,6 @@ ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 ; P9LE-NEXT: mulli r5, r5, 95 ; P9LE-NEXT: sub r3, r3, r5 ; P9LE-NEXT: mtvsrd v4, r3 @@ -249,10 +209,6 @@ ; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; P9LE-NEXT: srwi r5, r5, 6 ; P9LE-NEXT: mulli r5, r5, 95 ; P9LE-NEXT: sub r3, r3, r5 ; P9LE-NEXT: mtvsrd v4, r3 @@ -260,10 +216,6 @@ ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: sub r3, r3, r4 ; P9LE-NEXT: mtvsrd v2, r3 @@ -274,15 +226,11 @@ ; P9BE-LABEL: fold_urem_vec_2: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 22765 +; P9BE-NEXT: lis r4, 689 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 8969 +; P9BE-NEXT: ori r4, r4, 55879 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 ; P9BE-NEXT: mulli r5, r5, 95 ; P9BE-NEXT: sub r3, r3, r5 ; P9BE-NEXT: mtfprwz f0, r3 @@ -290,10 +238,6 @@ ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 ; P9BE-NEXT: mulli r5, r5, 95 ; P9BE-NEXT: sub r3, r3, r5 ; P9BE-NEXT: mtfprwz f1, r3 @@ -305,10 +249,6 @@ ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: xxperm vs0, vs1, vs2 ; 
P9BE-NEXT: mulhwu r5, r3, r4 -; P9BE-NEXT: sub r6, r3, r5 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r5, r6, r5 -; P9BE-NEXT: srwi r5, r5, 6 ; P9BE-NEXT: mulli r5, r5, 95 ; P9BE-NEXT: sub r3, r3, r5 ; P9BE-NEXT: mtfprwz f1, r3 @@ -316,10 +256,6 @@ ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 ; P9BE-NEXT: mulli r4, r4, 95 ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: mtfprwz f3, r3 @@ -330,38 +266,22 @@ ; P8LE-LABEL: fold_urem_vec_2: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: ori r3, r3, 55879 ; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: clrldi r5, r4, 48 ; P8LE-NEXT: rldicl r6, r4, 48, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 ; P8LE-NEXT: rldicl r7, r4, 32, 48 +; P8LE-NEXT: rldicl r4, r4, 16, 48 +; P8LE-NEXT: clrlwi r5, r5, 16 ; P8LE-NEXT: clrlwi r6, r6, 16 ; P8LE-NEXT: mulhwu r8, r5, r3 -; P8LE-NEXT: rldicl r4, r4, 16, 48 ; P8LE-NEXT: clrlwi r7, r7, 16 -; P8LE-NEXT: mulhwu r9, r6, r3 ; P8LE-NEXT: clrlwi r4, r4, 16 +; P8LE-NEXT: mulhwu r9, r6, r3 ; P8LE-NEXT: mulhwu r10, r7, r3 ; P8LE-NEXT: mulhwu r3, r4, r3 -; P8LE-NEXT: sub r11, r5, r8 -; P8LE-NEXT: sub r12, r6, r9 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: add r8, r11, r8 -; P8LE-NEXT: sub r11, r7, r10 -; P8LE-NEXT: srwi r12, r12, 1 -; P8LE-NEXT: add r9, r12, r9 -; P8LE-NEXT: sub r12, r4, r3 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: add r10, r11, r10 -; P8LE-NEXT: srwi r11, r12, 1 -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: add r3, r11, r3 ; P8LE-NEXT: mulli r8, r8, 95 -; P8LE-NEXT: srwi r10, r10, 6 -; P8LE-NEXT: srwi r3, r3, 6 ; P8LE-NEXT: mulli r9, r9, 95 ; P8LE-NEXT: mulli r10, r10, 95 ; P8LE-NEXT: mulli r3, r3, 95 @@ -381,37 +301,21 @@ ; P8BE-LABEL: fold_urem_vec_2: ; P8BE: # %bb.0: ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 -; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: lis r3, 689 +; P8BE-NEXT: ori r3, r3, 55879 ; P8BE-NEXT: clrldi r5, r4, 48 ; P8BE-NEXT: rldicl r6, r4, 48, 48 ; P8BE-NEXT: clrlwi r5, r5, 16 ; P8BE-NEXT: rldicl r7, r4, 32, 48 ; P8BE-NEXT: clrlwi r6, r6, 16 -; P8BE-NEXT: mulhwu r8, r5, r3 ; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: mulhwu r8, r5, r3 ; P8BE-NEXT: clrlwi r7, r7, 16 -; P8BE-NEXT: mulhwu r9, r6, r3 ; P8BE-NEXT: clrlwi r4, r4, 16 +; P8BE-NEXT: mulhwu r9, r6, r3 ; P8BE-NEXT: mulhwu r10, r7, r3 ; P8BE-NEXT: mulhwu r3, r4, r3 -; P8BE-NEXT: sub r11, r5, r8 -; P8BE-NEXT: sub r12, r6, r9 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: add r8, r11, r8 -; P8BE-NEXT: sub r11, r7, r10 -; P8BE-NEXT: srwi r12, r12, 1 -; P8BE-NEXT: add r9, r12, r9 -; P8BE-NEXT: sub r12, r4, r3 -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: add r10, r11, r10 -; P8BE-NEXT: srwi r11, r12, 1 -; P8BE-NEXT: srwi r9, r9, 6 ; P8BE-NEXT: mulli r8, r8, 95 -; P8BE-NEXT: add r3, r11, r3 -; P8BE-NEXT: srwi r10, r10, 6 -; P8BE-NEXT: srwi r3, r3, 6 ; P8BE-NEXT: mulli r9, r9, 95 ; P8BE-NEXT: mulli r10, r10, 95 ; P8BE-NEXT: mulli r3, r3, 95 @@ -440,26 +344,18 @@ ; P9LE-LABEL: combine_urem_udiv: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 +; P9LE-NEXT: lis r4, 689 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 +; P9LE-NEXT: ori r4, r4, 55879 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r5, r3, r4 -; P9LE-NEXT: sub r6, r3, r5 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r5, r6, r5 -; 
P9LE-NEXT: srwi r5, r5, 6 ; P9LE-NEXT: mulli r6, r5, 95 ; P9LE-NEXT: sub r3, r3, r6 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 2 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r6, r3, 16 -; P9LE-NEXT: mulhwu r7, r6, r4 -; P9LE-NEXT: sub r6, r6, r7 -; P9LE-NEXT: srwi r6, r6, 1 -; P9LE-NEXT: add r6, r6, r7 -; P9LE-NEXT: srwi r6, r6, 6 +; P9LE-NEXT: mulhwu r6, r6, r4 ; P9LE-NEXT: mulli r7, r6, 95 ; P9LE-NEXT: sub r3, r3, r7 ; P9LE-NEXT: mtvsrd v4, r3 @@ -467,11 +363,7 @@ ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: clrlwi r7, r3, 16 -; P9LE-NEXT: mulhwu r8, r7, r4 -; P9LE-NEXT: sub r7, r7, r8 -; P9LE-NEXT: srwi r7, r7, 1 -; P9LE-NEXT: add r7, r7, r8 -; P9LE-NEXT: srwi r7, r7, 6 +; P9LE-NEXT: mulhwu r7, r7, r4 ; P9LE-NEXT: mulli r8, r7, 95 ; P9LE-NEXT: sub r3, r3, r8 ; P9LE-NEXT: mtvsrd v4, r3 @@ -479,10 +371,6 @@ ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r8, r3, 16 ; P9LE-NEXT: mulhwu r4, r8, r4 -; P9LE-NEXT: sub r8, r8, r4 -; P9LE-NEXT: srwi r8, r8, 1 -; P9LE-NEXT: add r4, r8, r4 -; P9LE-NEXT: srwi r4, r4, 6 ; P9LE-NEXT: mulli r8, r4, 95 ; P9LE-NEXT: mtvsrd v5, r4 ; P9LE-NEXT: sub r3, r3, r8 @@ -501,26 +389,18 @@ ; P9BE-LABEL: combine_urem_udiv: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r5, 22765 +; P9BE-NEXT: lis r5, 689 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r5, r5, 8969 +; P9BE-NEXT: ori r5, r5, 55879 ; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: mulhwu r6, r4, r5 -; P9BE-NEXT: sub r4, r4, r6 -; P9BE-NEXT: srwi r4, r4, 1 -; P9BE-NEXT: add r4, r4, r6 -; P9BE-NEXT: srwi r4, r4, 6 +; P9BE-NEXT: mulhwu r4, r4, r5 ; P9BE-NEXT: mulli r6, r4, 95 ; P9BE-NEXT: sub r3, r3, r6 ; P9BE-NEXT: mtfprwz f0, r3 ; P9BE-NEXT: li r3, 4 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r6, r3, 16 -; P9BE-NEXT: mulhwu r7, r6, r5 -; P9BE-NEXT: sub r6, r6, r7 -; P9BE-NEXT: srwi r6, r6, 1 -; P9BE-NEXT: add r6, r6, r7 -; P9BE-NEXT: srwi r6, r6, 6 +; P9BE-NEXT: mulhwu r6, r6, r5 ; P9BE-NEXT: mulli r7, r6, 95 ; P9BE-NEXT: sub r3, r3, r7 ; P9BE-NEXT: mtfprwz f1, r3 @@ -531,11 +411,7 @@ ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r7, r3, 16 ; P9BE-NEXT: xxperm vs0, vs1, vs2 -; P9BE-NEXT: mulhwu r8, r7, r5 -; P9BE-NEXT: sub r7, r7, r8 -; P9BE-NEXT: srwi r7, r7, 1 -; P9BE-NEXT: add r7, r7, r8 -; P9BE-NEXT: srwi r7, r7, 6 +; P9BE-NEXT: mulhwu r7, r7, r5 ; P9BE-NEXT: mulli r8, r7, 95 ; P9BE-NEXT: sub r3, r3, r8 ; P9BE-NEXT: mtfprwz f1, r3 @@ -543,10 +419,6 @@ ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r5, r3, r5 -; P9BE-NEXT: sub r8, r3, r5 -; P9BE-NEXT: srwi r8, r8, 1 -; P9BE-NEXT: add r5, r8, r5 -; P9BE-NEXT: srwi r5, r5, 6 ; P9BE-NEXT: mulli r8, r5, 95 ; P9BE-NEXT: sub r3, r3, r8 ; P9BE-NEXT: mtfprwz f3, r3 @@ -565,9 +437,8 @@ ; P8LE-LABEL: combine_urem_udiv: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill -; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: ori r3, r3, 55879 ; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: clrldi r5, r4, 48 ; P8LE-NEXT: rldicl r6, r4, 48, 48 @@ -576,41 +447,24 @@ ; P8LE-NEXT: rldicl r7, r4, 32, 48 ; P8LE-NEXT: rldicl r4, r4, 16, 48 ; P8LE-NEXT: mulhwu r9, r5, r3 -; P8LE-NEXT: mulhwu r11, r8, r3 +; P8LE-NEXT: mulhwu r8, r8, r3 ; P8LE-NEXT: clrlwi r10, r7, 16 -; P8LE-NEXT: clrlwi r12, r4, 16 -; P8LE-NEXT: mulhwu r0, r10, r3 -; P8LE-NEXT: mulhwu r3, r12, r3 -; P8LE-NEXT: sub r30, r5, r9 -; P8LE-NEXT: sub r8, r8, r11 -; P8LE-NEXT: srwi r30, r30, 1 -; P8LE-NEXT: srwi r8, r8, 
1 -; P8LE-NEXT: sub r10, r10, r0 -; P8LE-NEXT: add r9, r30, r9 -; P8LE-NEXT: add r8, r8, r11 -; P8LE-NEXT: sub r11, r12, r3 -; P8LE-NEXT: srwi r10, r10, 1 -; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload -; P8LE-NEXT: srwi r9, r9, 6 -; P8LE-NEXT: srwi r11, r11, 1 -; P8LE-NEXT: srwi r8, r8, 6 -; P8LE-NEXT: add r10, r10, r0 -; P8LE-NEXT: mulli r12, r9, 95 -; P8LE-NEXT: add r3, r11, r3 +; P8LE-NEXT: clrlwi r11, r4, 16 +; P8LE-NEXT: mulhwu r10, r10, r3 +; P8LE-NEXT: mulhwu r3, r11, r3 +; P8LE-NEXT: mulli r11, r9, 95 ; P8LE-NEXT: mtvsrd v2, r9 -; P8LE-NEXT: srwi r10, r10, 6 ; P8LE-NEXT: mulli r9, r8, 95 -; P8LE-NEXT: srwi r3, r3, 6 ; P8LE-NEXT: mtvsrd v3, r8 ; P8LE-NEXT: mulli r8, r10, 95 ; P8LE-NEXT: mtvsrd v4, r10 ; P8LE-NEXT: mulli r10, r3, 95 ; P8LE-NEXT: vmrghh v2, v3, v2 -; P8LE-NEXT: sub r5, r5, r12 +; P8LE-NEXT: sub r5, r5, r11 ; P8LE-NEXT: sub r6, r6, r9 ; P8LE-NEXT: mtvsrd v3, r5 -; P8LE-NEXT: mtvsrd v5, r6 ; P8LE-NEXT: sub r5, r7, r8 +; P8LE-NEXT: mtvsrd v5, r6 ; P8LE-NEXT: sub r4, r4, r10 ; P8LE-NEXT: mtvsrd v0, r5 ; P8LE-NEXT: mtvsrd v1, r4 @@ -625,58 +479,42 @@ ; ; P8BE-LABEL: combine_urem_udiv: ; P8BE: # %bb.0: -; P8BE-NEXT: mfvsrd r5, v2 -; P8BE-NEXT: lis r4, 22765 -; P8BE-NEXT: ori r4, r4, 8969 -; P8BE-NEXT: clrldi r3, r5, 48 -; P8BE-NEXT: rldicl r6, r5, 48, 48 -; P8BE-NEXT: clrlwi r8, r3, 16 +; P8BE-NEXT: mfvsrd r4, v2 +; P8BE-NEXT: lis r3, 689 +; P8BE-NEXT: addis r11, r2, .LCPI2_0@toc@ha +; P8BE-NEXT: ori r3, r3, 55879 +; P8BE-NEXT: addi r11, r11, .LCPI2_0@toc@l +; P8BE-NEXT: clrldi r5, r4, 48 +; P8BE-NEXT: rldicl r6, r4, 48, 48 +; P8BE-NEXT: lxvw4x v2, 0, r11 +; P8BE-NEXT: clrlwi r8, r5, 16 ; P8BE-NEXT: clrlwi r9, r6, 16 -; P8BE-NEXT: rldicl r7, r5, 32, 48 -; P8BE-NEXT: rldicl r5, r5, 16, 48 -; P8BE-NEXT: mulhwu r10, r8, r4 -; P8BE-NEXT: mulhwu r12, r9, r4 -; P8BE-NEXT: clrlwi r11, r7, 16 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: mulhwu r0, r11, r4 -; P8BE-NEXT: mulhwu r4, r5, r4 -; P8BE-NEXT: sub r8, r8, r10 -; P8BE-NEXT: sub r9, r9, r12 -; P8BE-NEXT: srwi r8, r8, 1 -; P8BE-NEXT: srwi r9, r9, 1 -; P8BE-NEXT: sub r11, r11, r0 -; P8BE-NEXT: add r8, r8, r10 -; P8BE-NEXT: add r9, r9, r12 -; P8BE-NEXT: sub r12, r5, r4 -; P8BE-NEXT: addis r10, r2, .LCPI2_0@toc@ha -; P8BE-NEXT: srwi r11, r11, 1 -; P8BE-NEXT: srwi r8, r8, 6 -; P8BE-NEXT: srwi r12, r12, 1 -; P8BE-NEXT: srwi r9, r9, 6 -; P8BE-NEXT: addi r10, r10, .LCPI2_0@toc@l -; P8BE-NEXT: add r11, r11, r0 -; P8BE-NEXT: mulli r0, r8, 95 -; P8BE-NEXT: add r4, r12, r4 +; P8BE-NEXT: rldicl r7, r4, 32, 48 +; P8BE-NEXT: rldicl r4, r4, 16, 48 +; P8BE-NEXT: mulhwu r8, r8, r3 +; P8BE-NEXT: mulhwu r9, r9, r3 +; P8BE-NEXT: clrlwi r10, r7, 16 +; P8BE-NEXT: clrlwi r4, r4, 16 +; P8BE-NEXT: mulhwu r10, r10, r3 +; P8BE-NEXT: mulhwu r3, r4, r3 +; P8BE-NEXT: mulli r12, r8, 95 ; P8BE-NEXT: mtvsrwz v3, r8 -; P8BE-NEXT: lxvw4x v2, 0, r10 -; P8BE-NEXT: srwi r10, r11, 6 ; P8BE-NEXT: mulli r8, r9, 95 -; P8BE-NEXT: srwi r4, r4, 6 ; P8BE-NEXT: mtvsrwz v4, r9 ; P8BE-NEXT: mulli r9, r10, 95 ; P8BE-NEXT: mtvsrwz v5, r10 -; P8BE-NEXT: mulli r10, r4, 95 +; P8BE-NEXT: mulli r10, r3, 95 ; P8BE-NEXT: vperm v3, v4, v3, v2 -; P8BE-NEXT: sub r3, r3, r0 +; P8BE-NEXT: sub r5, r5, r12 ; P8BE-NEXT: sub r6, r6, r8 -; P8BE-NEXT: mtvsrwz v4, r3 +; P8BE-NEXT: mtvsrwz v4, r5 +; P8BE-NEXT: sub r5, r7, r9 ; P8BE-NEXT: mtvsrwz v0, r6 -; P8BE-NEXT: sub r3, r7, r9 -; P8BE-NEXT: sub r5, r5, r10 -; P8BE-NEXT: mtvsrwz v1, r3 -; P8BE-NEXT: mtvsrwz v6, r5 +; P8BE-NEXT: sub r4, r4, r10 +; P8BE-NEXT: mtvsrwz v1, r5 +; P8BE-NEXT: mtvsrwz v6, r4 ; P8BE-NEXT: vperm v4, v0, v4, v2 -; 
P8BE-NEXT: mtvsrwz v0, r4 +; P8BE-NEXT: mtvsrwz v0, r3 ; P8BE-NEXT: vperm v1, v6, v1, v2 ; P8BE-NEXT: vperm v2, v0, v5, v2 ; P8BE-NEXT: xxmrghw v4, v1, v4 @@ -694,9 +532,9 @@ ; P9LE-LABEL: dont_fold_urem_power_of_two: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 0 -; P9LE-NEXT: lis r4, 22765 +; P9LE-NEXT: lis r4, 689 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 8969 +; P9LE-NEXT: ori r4, r4, 55879 ; P9LE-NEXT: clrlwi r3, r3, 26 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 2 @@ -708,10 +546,6 @@ ; P9LE-NEXT: vmrghh v3, v4, v3 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: sub r5, r3, r4 -; P9LE-NEXT: srwi r5, r5, 1 -; P9LE-NEXT: add r4, r5, r4 -; P9LE-NEXT: srwi r4, r4, 6 ; P9LE-NEXT: mulli r4, r4, 95 ; P9LE-NEXT: sub r3, r3, r4 ; P9LE-NEXT: mtvsrd v4, r3 @@ -726,9 +560,9 @@ ; P9BE-LABEL: dont_fold_urem_power_of_two: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 2 -; P9BE-NEXT: lis r4, 22765 +; P9BE-NEXT: lis r4, 689 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 8969 +; P9BE-NEXT: ori r4, r4, 55879 ; P9BE-NEXT: clrlwi r3, r3, 27 ; P9BE-NEXT: mtfprwz f0, r3 ; P9BE-NEXT: li r3, 0 @@ -743,10 +577,6 @@ ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: xxperm vs0, vs1, vs2 ; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: sub r5, r3, r4 -; P9BE-NEXT: srwi r5, r5, 1 -; P9BE-NEXT: add r4, r5, r4 -; P9BE-NEXT: srwi r4, r4, 6 ; P9BE-NEXT: mulli r4, r4, 95 ; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: mtfprwz f1, r3 @@ -761,50 +591,41 @@ ; P8LE-LABEL: dont_fold_urem_power_of_two: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, 22765 -; P8LE-NEXT: ori r3, r3, 8969 +; P8LE-NEXT: lis r3, 689 +; P8LE-NEXT: ori r3, r3, 55879 ; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: rldicl r5, r4, 16, 48 -; P8LE-NEXT: rldicl r7, r4, 48, 48 -; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r5, r3 -; P8LE-NEXT: sub r6, r5, r3 -; P8LE-NEXT: srwi r6, r6, 1 -; P8LE-NEXT: add r3, r6, r3 ; P8LE-NEXT: clrldi r6, r4, 48 -; P8LE-NEXT: srwi r3, r3, 6 +; P8LE-NEXT: clrlwi r5, r5, 16 ; P8LE-NEXT: clrlwi r6, r6, 26 -; P8LE-NEXT: mulli r3, r3, 95 -; P8LE-NEXT: rldicl r4, r4, 32, 48 +; P8LE-NEXT: mulhwu r3, r5, r3 +; P8LE-NEXT: rldicl r7, r4, 48, 48 ; P8LE-NEXT: mtvsrd v2, r6 +; P8LE-NEXT: rldicl r4, r4, 32, 48 ; P8LE-NEXT: clrlwi r6, r7, 27 ; P8LE-NEXT: clrlwi r4, r4, 29 ; P8LE-NEXT: mtvsrd v3, r6 -; P8LE-NEXT: mtvsrd v5, r4 +; P8LE-NEXT: mtvsrd v4, r4 +; P8LE-NEXT: mulli r3, r3, 95 ; P8LE-NEXT: vmrghh v2, v3, v2 ; P8LE-NEXT: sub r3, r5, r3 -; P8LE-NEXT: mtvsrd v4, r3 -; P8LE-NEXT: vmrghh v3, v4, v5 +; P8LE-NEXT: mtvsrd v5, r3 +; P8LE-NEXT: vmrghh v3, v5, v4 ; P8LE-NEXT: xxmrglw v2, v3, v2 ; P8LE-NEXT: blr ; ; P8BE-LABEL: dont_fold_urem_power_of_two: ; P8BE: # %bb.0: ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 22765 +; P8BE-NEXT: lis r3, 689 ; P8BE-NEXT: addis r7, r2, .LCPI3_0@toc@ha -; P8BE-NEXT: ori r3, r3, 8969 +; P8BE-NEXT: ori r3, r3, 55879 ; P8BE-NEXT: clrldi r5, r4, 48 -; P8BE-NEXT: rldicl r8, r4, 16, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 -; P8BE-NEXT: mulhwu r3, r5, r3 -; P8BE-NEXT: sub r6, r5, r3 -; P8BE-NEXT: srwi r6, r6, 1 -; P8BE-NEXT: add r3, r6, r3 ; P8BE-NEXT: rldicl r6, r4, 32, 48 -; P8BE-NEXT: srwi r3, r3, 6 +; P8BE-NEXT: clrlwi r5, r5, 16 ; P8BE-NEXT: clrlwi r6, r6, 27 -; P8BE-NEXT: mulli r3, r3, 95 +; P8BE-NEXT: mulhwu r3, r5, r3 +; P8BE-NEXT: rldicl r8, r4, 16, 48 ; P8BE-NEXT: mtvsrwz v2, r6 ; P8BE-NEXT: addi r6, r7, .LCPI3_0@toc@l ; P8BE-NEXT: rldicl r4, r4, 48, 48 @@ -813,8 +634,9 @@ ; P8BE-NEXT: clrlwi r4, r4, 29 ; P8BE-NEXT: mtvsrwz v4, r7 ; P8BE-NEXT: mtvsrwz 
v0, r4 -; P8BE-NEXT: sub r3, r5, r3 +; P8BE-NEXT: mulli r3, r3, 95 ; P8BE-NEXT: vperm v2, v4, v2, v3 +; P8BE-NEXT: sub r3, r5, r3 ; P8BE-NEXT: mtvsrwz v5, r3 ; P8BE-NEXT: vperm v3, v0, v5, v3 ; P8BE-NEXT: xxmrghw v2, v2, v3 @@ -828,36 +650,32 @@ ; P9LE-LABEL: dont_fold_urem_one: ; P9LE: # %bb.0: ; P9LE-NEXT: li r3, 4 -; P9LE-NEXT: lis r4, -19946 -; P9LE-NEXT: lis r5, -14230 +; P9LE-NEXT: lis r4, 2849 ; P9LE-NEXT: vextuhrx r3, r3, v2 -; P9LE-NEXT: ori r4, r4, 17097 -; P9LE-NEXT: ori r5, r5, 30865 +; P9LE-NEXT: ori r4, r4, 25645 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 4 ; P9LE-NEXT: mulli r4, r4, 23 ; P9LE-NEXT: sub r3, r3, r4 -; P9LE-NEXT: lis r4, 24749 +; P9LE-NEXT: lis r4, 12 ; P9LE-NEXT: mtvsrd v3, r3 ; P9LE-NEXT: li r3, 6 -; P9LE-NEXT: ori r4, r4, 47143 +; P9LE-NEXT: ori r4, r4, 5560 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: clrlwi r3, r3, 16 ; P9LE-NEXT: mulhwu r4, r3, r4 -; P9LE-NEXT: srwi r4, r4, 11 ; P9LE-NEXT: mulli r4, r4, 5423 ; P9LE-NEXT: sub r3, r3, r4 +; P9LE-NEXT: lis r4, 100 ; P9LE-NEXT: mtvsrd v4, r3 ; P9LE-NEXT: li r3, 2 +; P9LE-NEXT: ori r4, r4, 13629 ; P9LE-NEXT: vextuhrx r3, r3, v2 ; P9LE-NEXT: vmrghh v3, v4, v3 -; P9LE-NEXT: clrlwi r4, r3, 16 -; P9LE-NEXT: rlwinm r3, r3, 31, 17, 31 -; P9LE-NEXT: mulhwu r3, r3, r5 -; P9LE-NEXT: srwi r3, r3, 8 -; P9LE-NEXT: mulli r3, r3, 654 -; P9LE-NEXT: sub r3, r4, r3 +; P9LE-NEXT: clrlwi r3, r3, 16 +; P9LE-NEXT: mulhwu r4, r3, r4 +; P9LE-NEXT: mulli r4, r4, 654 +; P9LE-NEXT: sub r3, r3, r4 ; P9LE-NEXT: mtvsrd v2, r3 ; P9LE-NEXT: li r3, 0 ; P9LE-NEXT: mtvsrd v4, r3 @@ -868,39 +686,35 @@ ; P9BE-LABEL: dont_fold_urem_one: ; P9BE: # %bb.0: ; P9BE-NEXT: li r3, 6 -; P9BE-NEXT: lis r4, 24749 -; P9BE-NEXT: lis r5, -14230 +; P9BE-NEXT: lis r4, 12 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: ori r4, r4, 47143 -; P9BE-NEXT: ori r5, r5, 30865 +; P9BE-NEXT: ori r4, r4, 5560 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 11 ; P9BE-NEXT: mulli r4, r4, 5423 ; P9BE-NEXT: sub r3, r3, r4 -; P9BE-NEXT: lis r4, -19946 +; P9BE-NEXT: lis r4, 2849 ; P9BE-NEXT: mtfprwz f0, r3 ; P9BE-NEXT: li r3, 4 -; P9BE-NEXT: ori r4, r4, 17097 +; P9BE-NEXT: ori r4, r4, 25645 ; P9BE-NEXT: vextuhlx r3, r3, v2 ; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: mulhwu r4, r3, r4 -; P9BE-NEXT: srwi r4, r4, 4 ; P9BE-NEXT: mulli r4, r4, 23 ; P9BE-NEXT: sub r3, r3, r4 +; P9BE-NEXT: lis r4, 100 ; P9BE-NEXT: mtfprwz f1, r3 ; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; P9BE-NEXT: ori r4, r4, 13629 ; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l ; P9BE-NEXT: lxv vs2, 0(r3) ; P9BE-NEXT: li r3, 2 ; P9BE-NEXT: vextuhlx r3, r3, v2 -; P9BE-NEXT: clrlwi r4, r3, 16 -; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31 +; P9BE-NEXT: clrlwi r3, r3, 16 ; P9BE-NEXT: xxperm vs0, vs1, vs2 -; P9BE-NEXT: mulhwu r3, r3, r5 -; P9BE-NEXT: srwi r3, r3, 8 -; P9BE-NEXT: mulli r3, r3, 654 -; P9BE-NEXT: sub r3, r4, r3 +; P9BE-NEXT: mulhwu r4, r3, r4 +; P9BE-NEXT: mulli r4, r4, 654 +; P9BE-NEXT: sub r3, r3, r4 ; P9BE-NEXT: mtfprwz f1, r3 ; P9BE-NEXT: li r3, 0 ; P9BE-NEXT: mtfprwz f3, r3 @@ -911,29 +725,25 @@ ; P8LE-LABEL: dont_fold_urem_one: ; P8LE: # %bb.0: ; P8LE-NEXT: xxswapd vs0, v2 -; P8LE-NEXT: lis r3, -14230 -; P8LE-NEXT: lis r7, -19946 -; P8LE-NEXT: lis r9, 24749 -; P8LE-NEXT: ori r3, r3, 30865 -; P8LE-NEXT: ori r7, r7, 17097 +; P8LE-NEXT: lis r3, 100 +; P8LE-NEXT: lis r7, 2849 +; P8LE-NEXT: lis r8, 12 +; P8LE-NEXT: li r9, 0 +; P8LE-NEXT: ori r3, r3, 13629 +; P8LE-NEXT: ori r7, r7, 25645 +; P8LE-NEXT: ori r8, r8, 5560 +; P8LE-NEXT: 
mtvsrd v2, r9 ; P8LE-NEXT: mffprd r4, f0 ; P8LE-NEXT: rldicl r5, r4, 48, 48 ; P8LE-NEXT: rldicl r6, r4, 32, 48 ; P8LE-NEXT: rldicl r4, r4, 16, 48 -; P8LE-NEXT: rlwinm r8, r5, 31, 17, 31 -; P8LE-NEXT: clrlwi r6, r6, 16 ; P8LE-NEXT: clrlwi r5, r5, 16 -; P8LE-NEXT: mulhwu r3, r8, r3 -; P8LE-NEXT: ori r8, r9, 47143 +; P8LE-NEXT: clrlwi r6, r6, 16 +; P8LE-NEXT: mulhwu r3, r5, r3 ; P8LE-NEXT: clrlwi r4, r4, 16 -; P8LE-NEXT: li r9, 0 ; P8LE-NEXT: mulhwu r7, r6, r7 ; P8LE-NEXT: mulhwu r8, r4, r8 -; P8LE-NEXT: mtvsrd v2, r9 -; P8LE-NEXT: srwi r3, r3, 8 -; P8LE-NEXT: srwi r7, r7, 4 ; P8LE-NEXT: mulli r3, r3, 654 -; P8LE-NEXT: srwi r8, r8, 11 ; P8LE-NEXT: mulli r7, r7, 23 ; P8LE-NEXT: mulli r8, r8, 5423 ; P8LE-NEXT: sub r3, r5, r3 @@ -950,37 +760,33 @@ ; P8BE-LABEL: dont_fold_urem_one: ; P8BE: # %bb.0: ; P8BE-NEXT: mfvsrd r4, v2 -; P8BE-NEXT: lis r3, 24749 -; P8BE-NEXT: lis r7, -19946 -; P8BE-NEXT: lis r8, -14230 +; P8BE-NEXT: lis r3, 12 +; P8BE-NEXT: lis r7, 2849 +; P8BE-NEXT: lis r8, 100 +; P8BE-NEXT: addis r9, r2, .LCPI4_0@toc@ha ; P8BE-NEXT: li r10, 0 -; P8BE-NEXT: ori r3, r3, 47143 -; P8BE-NEXT: ori r7, r7, 17097 -; P8BE-NEXT: ori r8, r8, 30865 +; P8BE-NEXT: ori r3, r3, 5560 +; P8BE-NEXT: ori r7, r7, 25645 +; P8BE-NEXT: ori r8, r8, 13629 ; P8BE-NEXT: mtvsrwz v2, r10 ; P8BE-NEXT: clrldi r5, r4, 48 ; P8BE-NEXT: rldicl r6, r4, 48, 48 -; P8BE-NEXT: clrlwi r5, r5, 16 ; P8BE-NEXT: rldicl r4, r4, 32, 48 +; P8BE-NEXT: clrlwi r5, r5, 16 ; P8BE-NEXT: clrlwi r6, r6, 16 ; P8BE-NEXT: mulhwu r3, r5, r3 -; P8BE-NEXT: rlwinm r9, r4, 31, 17, 31 +; P8BE-NEXT: clrlwi r4, r4, 16 ; P8BE-NEXT: mulhwu r7, r6, r7 -; P8BE-NEXT: mulhwu r8, r9, r8 -; P8BE-NEXT: addis r9, r2, .LCPI4_0@toc@ha -; P8BE-NEXT: srwi r3, r3, 11 +; P8BE-NEXT: mulhwu r8, r4, r8 ; P8BE-NEXT: mulli r3, r3, 5423 -; P8BE-NEXT: srwi r7, r7, 4 -; P8BE-NEXT: srwi r8, r8, 8 ; P8BE-NEXT: mulli r7, r7, 23 ; P8BE-NEXT: mulli r8, r8, 654 ; P8BE-NEXT: sub r3, r5, r3 ; P8BE-NEXT: addi r5, r9, .LCPI4_0@toc@l -; P8BE-NEXT: mtvsrwz v4, r3 -; P8BE-NEXT: clrlwi r3, r4, 16 ; P8BE-NEXT: lxvw4x v3, 0, r5 ; P8BE-NEXT: sub r5, r6, r7 -; P8BE-NEXT: sub r3, r3, r8 +; P8BE-NEXT: mtvsrwz v4, r3 +; P8BE-NEXT: sub r3, r4, r8 ; P8BE-NEXT: mtvsrwz v5, r5 ; P8BE-NEXT: mtvsrwz v0, r3 ; P8BE-NEXT: vperm v4, v5, v4, v3 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -58,35 +58,27 @@ ; RV32IM-NEXT: lhu a3, 8(a1) ; RV32IM-NEXT: lhu a4, 0(a1) ; RV32IM-NEXT: lhu a1, 4(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: lui a5, 11038 +; RV32IM-NEXT: addi a5, a5, -1465 ; RV32IM-NEXT: mulhu a5, a4, a5 -; RV32IM-NEXT: sub a6, a4, a5 -; RV32IM-NEXT: srli a6, a6, 1 -; RV32IM-NEXT: add a5, a6, a5 -; RV32IM-NEXT: srli a5, a5, 6 ; RV32IM-NEXT: li a6, 95 ; RV32IM-NEXT: mul a5, a5, a6 ; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: srli a5, a1, 2 -; RV32IM-NEXT: lui a6, 135300 -; RV32IM-NEXT: addi a6, a6, 529 -; RV32IM-NEXT: mulhu a5, a5, a6 -; RV32IM-NEXT: srli a5, a5, 2 +; RV32IM-NEXT: lui a5, 8456 +; RV32IM-NEXT: addi a5, a5, 1058 +; RV32IM-NEXT: mulhu a5, a1, a5 ; RV32IM-NEXT: li a6, 124 ; RV32IM-NEXT: mul a5, a5, a6 ; RV32IM-NEXT: sub a1, a1, a5 -; RV32IM-NEXT: lui a5, 342392 -; RV32IM-NEXT: addi a5, a5, 669 +; RV32IM-NEXT: lui a5, 10700 +; RV32IM-NEXT: addi a5, a5, -1003 ; RV32IM-NEXT: mulhu a5, a3, a5 -; RV32IM-NEXT: srli a5, a5, 5 ; RV32IM-NEXT: li a6, 98 ; RV32IM-NEXT: mul a5, a5, a6 ; RV32IM-NEXT: sub a3, 
a3, a5 -; RV32IM-NEXT: lui a5, 267633 -; RV32IM-NEXT: addi a5, a5, -1809 +; RV32IM-NEXT: lui a5, 1045 +; RV32IM-NEXT: addi a5, a5, 1801 ; RV32IM-NEXT: mulhu a5, a2, a5 -; RV32IM-NEXT: srli a5, a5, 8 ; RV32IM-NEXT: li a6, 1003 ; RV32IM-NEXT: mul a5, a5, a6 ; RV32IM-NEXT: sub a2, a2, a5 @@ -147,33 +139,24 @@ ; RV64IM-NEXT: lhu a5, 16(a1) ; RV64IM-NEXT: lhu a1, 8(a1) ; RV64IM-NEXT: mulhu a3, a2, a3 -; RV64IM-NEXT: sub a6, a2, a3 -; RV64IM-NEXT: srli a6, a6, 1 -; RV64IM-NEXT: add a3, a6, a3 -; RV64IM-NEXT: srli a3, a3, 6 -; RV64IM-NEXT: li a6, 95 -; RV64IM-NEXT: lui a7, %hi(.LCPI0_1) -; RV64IM-NEXT: ld a7, %lo(.LCPI0_1)(a7) -; RV64IM-NEXT: mulw a3, a3, a6 +; RV64IM-NEXT: lui a6, %hi(.LCPI0_1) +; RV64IM-NEXT: ld a6, %lo(.LCPI0_1)(a6) +; RV64IM-NEXT: li a7, 95 +; RV64IM-NEXT: mulw a3, a3, a7 ; RV64IM-NEXT: subw a2, a2, a3 -; RV64IM-NEXT: srli a3, a1, 2 -; RV64IM-NEXT: mulhu a3, a3, a7 -; RV64IM-NEXT: srli a3, a3, 3 -; RV64IM-NEXT: li a6, 124 -; RV64IM-NEXT: lui a7, %hi(.LCPI0_2) -; RV64IM-NEXT: ld a7, %lo(.LCPI0_2)(a7) -; RV64IM-NEXT: mulw a3, a3, a6 +; RV64IM-NEXT: mulhu a3, a1, a6 +; RV64IM-NEXT: lui a6, %hi(.LCPI0_2) +; RV64IM-NEXT: ld a6, %lo(.LCPI0_2)(a6) +; RV64IM-NEXT: li a7, 124 +; RV64IM-NEXT: mulw a3, a3, a7 ; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: srli a3, a5, 1 -; RV64IM-NEXT: mulhu a3, a3, a7 -; RV64IM-NEXT: srli a3, a3, 4 +; RV64IM-NEXT: mulhu a3, a5, a6 ; RV64IM-NEXT: lui a6, %hi(.LCPI0_3) ; RV64IM-NEXT: ld a6, %lo(.LCPI0_3)(a6) ; RV64IM-NEXT: li a7, 98 ; RV64IM-NEXT: mulw a3, a3, a7 ; RV64IM-NEXT: subw a5, a5, a3 ; RV64IM-NEXT: mulhu a3, a4, a6 -; RV64IM-NEXT: srli a3, a3, 7 ; RV64IM-NEXT: li a6, 1003 ; RV64IM-NEXT: mulw a3, a3, a6 ; RV64IM-NEXT: subw a4, a4, a3 @@ -235,35 +218,19 @@ ; RV32IM-NEXT: lhu a3, 8(a1) ; RV32IM-NEXT: lhu a4, 0(a1) ; RV32IM-NEXT: lhu a1, 4(a1) -; RV32IM-NEXT: lui a5, 364242 -; RV32IM-NEXT: addi a5, a5, 777 +; RV32IM-NEXT: lui a5, 11038 +; RV32IM-NEXT: addi a5, a5, -1465 ; RV32IM-NEXT: mulhu a6, a4, a5 -; RV32IM-NEXT: sub a7, a4, a6 -; RV32IM-NEXT: srli a7, a7, 1 -; RV32IM-NEXT: add a6, a7, a6 -; RV32IM-NEXT: srli a6, a6, 6 ; RV32IM-NEXT: li a7, 95 ; RV32IM-NEXT: mul a6, a6, a7 ; RV32IM-NEXT: sub a4, a4, a6 ; RV32IM-NEXT: mulhu a6, a1, a5 -; RV32IM-NEXT: sub t0, a1, a6 -; RV32IM-NEXT: srli t0, t0, 1 -; RV32IM-NEXT: add a6, t0, a6 -; RV32IM-NEXT: srli a6, a6, 6 ; RV32IM-NEXT: mul a6, a6, a7 ; RV32IM-NEXT: sub a1, a1, a6 ; RV32IM-NEXT: mulhu a6, a3, a5 -; RV32IM-NEXT: sub t0, a3, a6 -; RV32IM-NEXT: srli t0, t0, 1 -; RV32IM-NEXT: add a6, t0, a6 -; RV32IM-NEXT: srli a6, a6, 6 ; RV32IM-NEXT: mul a6, a6, a7 ; RV32IM-NEXT: sub a3, a3, a6 ; RV32IM-NEXT: mulhu a5, a2, a5 -; RV32IM-NEXT: sub a6, a2, a5 -; RV32IM-NEXT: srli a6, a6, 1 -; RV32IM-NEXT: add a5, a6, a5 -; RV32IM-NEXT: srli a5, a5, 6 ; RV32IM-NEXT: mul a5, a5, a7 ; RV32IM-NEXT: sub a2, a2, a5 ; RV32IM-NEXT: sh a2, 6(a0) @@ -323,32 +290,16 @@ ; RV64IM-NEXT: lhu a5, 16(a1) ; RV64IM-NEXT: lhu a1, 8(a1) ; RV64IM-NEXT: mulhu a6, a2, a3 -; RV64IM-NEXT: sub a7, a2, a6 -; RV64IM-NEXT: srli a7, a7, 1 -; RV64IM-NEXT: add a6, a7, a6 -; RV64IM-NEXT: srli a6, a6, 6 ; RV64IM-NEXT: li a7, 95 ; RV64IM-NEXT: mulw a6, a6, a7 ; RV64IM-NEXT: subw a2, a2, a6 ; RV64IM-NEXT: mulhu a6, a1, a3 -; RV64IM-NEXT: sub t0, a1, a6 -; RV64IM-NEXT: srli t0, t0, 1 -; RV64IM-NEXT: add a6, t0, a6 -; RV64IM-NEXT: srli a6, a6, 6 ; RV64IM-NEXT: mulw a6, a6, a7 ; RV64IM-NEXT: subw a1, a1, a6 ; RV64IM-NEXT: mulhu a6, a5, a3 -; RV64IM-NEXT: sub t0, a5, a6 -; RV64IM-NEXT: srli t0, t0, 1 -; RV64IM-NEXT: add a6, t0, a6 -; RV64IM-NEXT: srli a6, 
 ; RV64IM-NEXT: mulw a6, a6, a7
 ; RV64IM-NEXT: subw a5, a5, a6
 ; RV64IM-NEXT: mulhu a3, a4, a3
-; RV64IM-NEXT: sub a6, a4, a3
-; RV64IM-NEXT: srli a6, a6, 1
-; RV64IM-NEXT: add a3, a6, a3
-; RV64IM-NEXT: srli a3, a3, 6
 ; RV64IM-NEXT: mulw a3, a3, a7
 ; RV64IM-NEXT: subw a4, a4, a3
 ; RV64IM-NEXT: sh a4, 6(a0)
@@ -439,32 +390,16 @@
 ; RV32IM-NEXT: lhu a3, 4(a1)
 ; RV32IM-NEXT: lhu a4, 12(a1)
 ; RV32IM-NEXT: lhu a1, 8(a1)
-; RV32IM-NEXT: lui a5, 364242
-; RV32IM-NEXT: addi a5, a5, 777
+; RV32IM-NEXT: lui a5, 11038
+; RV32IM-NEXT: addi a5, a5, -1465
 ; RV32IM-NEXT: mulhu a6, a4, a5
-; RV32IM-NEXT: sub a7, a4, a6
-; RV32IM-NEXT: srli a7, a7, 1
-; RV32IM-NEXT: add a6, a7, a6
-; RV32IM-NEXT: srli a6, a6, 6
 ; RV32IM-NEXT: li a7, 95
 ; RV32IM-NEXT: mul t0, a6, a7
 ; RV32IM-NEXT: mulhu t1, a1, a5
-; RV32IM-NEXT: sub t2, a1, t1
-; RV32IM-NEXT: srli t2, t2, 1
-; RV32IM-NEXT: add t1, t2, t1
-; RV32IM-NEXT: srli t1, t1, 6
 ; RV32IM-NEXT: mul t2, t1, a7
 ; RV32IM-NEXT: mulhu t3, a3, a5
-; RV32IM-NEXT: sub t4, a3, t3
-; RV32IM-NEXT: srli t4, t4, 1
-; RV32IM-NEXT: add t3, t4, t3
-; RV32IM-NEXT: srli t3, t3, 6
 ; RV32IM-NEXT: mul t4, t3, a7
 ; RV32IM-NEXT: mulhu a5, a2, a5
-; RV32IM-NEXT: sub t5, a2, a5
-; RV32IM-NEXT: srli t5, t5, 1
-; RV32IM-NEXT: add a5, t5, a5
-; RV32IM-NEXT: srli a5, a5, 6
 ; RV32IM-NEXT: mul a7, a5, a7
 ; RV32IM-NEXT: sub a5, a7, a5
 ; RV32IM-NEXT: sub a2, a2, a5
@@ -559,29 +494,13 @@
 ; RV64IM-NEXT: lhu a5, 8(a1)
 ; RV64IM-NEXT: lhu a1, 16(a1)
 ; RV64IM-NEXT: mulhu a6, a2, a3
-; RV64IM-NEXT: sub a7, a2, a6
-; RV64IM-NEXT: srli a7, a7, 1
-; RV64IM-NEXT: add a6, a7, a6
-; RV64IM-NEXT: srli a6, a6, 6
 ; RV64IM-NEXT: li a7, 95
 ; RV64IM-NEXT: mulw t0, a6, a7
 ; RV64IM-NEXT: mulhu t1, a1, a3
-; RV64IM-NEXT: sub t2, a1, t1
-; RV64IM-NEXT: srli t2, t2, 1
-; RV64IM-NEXT: add t1, t2, t1
-; RV64IM-NEXT: srli t1, t1, 6
 ; RV64IM-NEXT: mulw t2, t1, a7
 ; RV64IM-NEXT: mulhu t3, a5, a3
-; RV64IM-NEXT: sub t4, a5, t3
-; RV64IM-NEXT: srli t4, t4, 1
-; RV64IM-NEXT: add t3, t4, t3
-; RV64IM-NEXT: srli t3, t3, 6
 ; RV64IM-NEXT: mulw t4, t3, a7
 ; RV64IM-NEXT: mulhu a3, a4, a3
-; RV64IM-NEXT: sub t5, a4, a3
-; RV64IM-NEXT: srli t5, t5, 1
-; RV64IM-NEXT: add a3, t5, a3
-; RV64IM-NEXT: srli a3, a3, 6
 ; RV64IM-NEXT: mulw a7, a3, a7
 ; RV64IM-NEXT: subw a3, a7, a3
 ; RV64IM-NEXT: subw a4, a4, a3
@@ -641,13 +560,9 @@
 ; RV32IM-NEXT: lhu a3, 4(a1)
 ; RV32IM-NEXT: lhu a4, 12(a1)
 ; RV32IM-NEXT: lhu a1, 0(a1)
-; RV32IM-NEXT: lui a5, 364242
-; RV32IM-NEXT: addi a5, a5, 777
+; RV32IM-NEXT: lui a5, 11038
+; RV32IM-NEXT: addi a5, a5, -1465
 ; RV32IM-NEXT: mulhu a5, a4, a5
-; RV32IM-NEXT: sub a6, a4, a5
-; RV32IM-NEXT: srli a6, a6, 1
-; RV32IM-NEXT: add a5, a6, a5
-; RV32IM-NEXT: srli a5, a5, 6
 ; RV32IM-NEXT: li a6, 95
 ; RV32IM-NEXT: mul a5, a5, a6
 ; RV32IM-NEXT: sub a4, a4, a5
@@ -700,10 +615,6 @@
 ; RV64IM-NEXT: lhu a5, 8(a1)
 ; RV64IM-NEXT: lhu a1, 0(a1)
 ; RV64IM-NEXT: mulhu a3, a2, a3
-; RV64IM-NEXT: sub a6, a2, a3
-; RV64IM-NEXT: srli a6, a6, 1
-; RV64IM-NEXT: add a3, a6, a3
-; RV64IM-NEXT: srli a3, a3, 6
 ; RV64IM-NEXT: li a6, 95
 ; RV64IM-NEXT: mulw a3, a3, a6
 ; RV64IM-NEXT: subw a2, a2, a3
@@ -759,36 +670,32 @@
 ;
 ; RV32IM-LABEL: dont_fold_urem_one:
 ; RV32IM: # %bb.0:
-; RV32IM-NEXT: lhu a2, 4(a1)
-; RV32IM-NEXT: lhu a3, 12(a1)
+; RV32IM-NEXT: lhu a2, 12(a1)
+; RV32IM-NEXT: lhu a3, 4(a1)
 ; RV32IM-NEXT: lhu a1, 8(a1)
-; RV32IM-NEXT: srli a4, a2, 1
-; RV32IM-NEXT: lui a5, 820904
-; RV32IM-NEXT: addi a5, a5, -1903
-; RV32IM-NEXT: mulhu a4, a4, a5
-; RV32IM-NEXT: srli a4, a4, 8
+; RV32IM-NEXT: lui a4, 1603
+; RV32IM-NEXT: addi a4, a4, 1341
+; RV32IM-NEXT: mulhu a4, a3, a4
 ; RV32IM-NEXT: li a5, 654
 ; RV32IM-NEXT: mul a4, a4, a5
-; RV32IM-NEXT: sub a2, a2, a4
-; RV32IM-NEXT: lui a4, 729444
-; RV32IM-NEXT: addi a4, a4, 713
+; RV32IM-NEXT: sub a3, a3, a4
+; RV32IM-NEXT: lui a4, 45590
+; RV32IM-NEXT: addi a4, a4, 1069
 ; RV32IM-NEXT: mulhu a4, a1, a4
-; RV32IM-NEXT: srli a4, a4, 4
 ; RV32IM-NEXT: li a5, 23
 ; RV32IM-NEXT: mul a4, a4, a5
 ; RV32IM-NEXT: sub a1, a1, a4
-; RV32IM-NEXT: lui a4, 395996
-; RV32IM-NEXT: addi a4, a4, -2009
-; RV32IM-NEXT: mulhu a4, a3, a4
-; RV32IM-NEXT: srli a4, a4, 11
+; RV32IM-NEXT: lui a4, 193
+; RV32IM-NEXT: addi a4, a4, 1464
+; RV32IM-NEXT: mulhu a4, a2, a4
 ; RV32IM-NEXT: lui a5, 1
 ; RV32IM-NEXT: addi a5, a5, 1327
 ; RV32IM-NEXT: mul a4, a4, a5
-; RV32IM-NEXT: sub a3, a3, a4
+; RV32IM-NEXT: sub a2, a2, a4
 ; RV32IM-NEXT: sh zero, 0(a0)
-; RV32IM-NEXT: sh a3, 6(a0)
+; RV32IM-NEXT: sh a2, 6(a0)
 ; RV32IM-NEXT: sh a1, 4(a0)
-; RV32IM-NEXT: sh a2, 2(a0)
+; RV32IM-NEXT: sh a3, 2(a0)
 ; RV32IM-NEXT: ret
 ;
 ; RV64I-LABEL: dont_fold_urem_one:
@@ -829,39 +736,32 @@
 ;
 ; RV64IM-LABEL: dont_fold_urem_one:
 ; RV64IM: # %bb.0:
-; RV64IM-NEXT: lhu a2, 16(a1)
+; RV64IM-NEXT: lhu a2, 8(a1)
 ; RV64IM-NEXT: lui a3, %hi(.LCPI4_0)
 ; RV64IM-NEXT: ld a3, %lo(.LCPI4_0)(a3)
 ; RV64IM-NEXT: lhu a4, 24(a1)
-; RV64IM-NEXT: lhu a1, 8(a1)
+; RV64IM-NEXT: lhu a1, 16(a1)
 ; RV64IM-NEXT: mulhu a3, a2, a3
-; RV64IM-NEXT: sub a5, a2, a3
-; RV64IM-NEXT: srli a5, a5, 1
-; RV64IM-NEXT: add a3, a5, a3
-; RV64IM-NEXT: srli a3, a3, 4
-; RV64IM-NEXT: li a5, 23
-; RV64IM-NEXT: lui a6, %hi(.LCPI4_1)
-; RV64IM-NEXT: ld a6, %lo(.LCPI4_1)(a6)
-; RV64IM-NEXT: mulw a3, a3, a5
+; RV64IM-NEXT: lui a5, %hi(.LCPI4_1)
+; RV64IM-NEXT: ld a5, %lo(.LCPI4_1)(a5)
+; RV64IM-NEXT: li a6, 654
+; RV64IM-NEXT: mulw a3, a3, a6
 ; RV64IM-NEXT: subw a2, a2, a3
-; RV64IM-NEXT: srli a3, a1, 1
-; RV64IM-NEXT: mulhu a3, a3, a6
-; RV64IM-NEXT: srli a3, a3, 7
+; RV64IM-NEXT: mulhu a3, a1, a5
 ; RV64IM-NEXT: lui a5, %hi(.LCPI4_2)
 ; RV64IM-NEXT: ld a5, %lo(.LCPI4_2)(a5)
-; RV64IM-NEXT: li a6, 654
+; RV64IM-NEXT: li a6, 23
 ; RV64IM-NEXT: mulw a3, a3, a6
 ; RV64IM-NEXT: subw a1, a1, a3
 ; RV64IM-NEXT: mulhu a3, a4, a5
-; RV64IM-NEXT: srli a3, a3, 12
 ; RV64IM-NEXT: lui a5, 1
 ; RV64IM-NEXT: addiw a5, a5, 1327
 ; RV64IM-NEXT: mulw a3, a3, a5
 ; RV64IM-NEXT: subw a4, a4, a3
 ; RV64IM-NEXT: sh zero, 0(a0)
 ; RV64IM-NEXT: sh a4, 6(a0)
-; RV64IM-NEXT: sh a1, 2(a0)
-; RV64IM-NEXT: sh a2, 4(a0)
+; RV64IM-NEXT: sh a1, 4(a0)
+; RV64IM-NEXT: sh a2, 2(a0)
 ; RV64IM-NEXT: ret
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -320,10 +320,7 @@
 ; X64-FAST-NEXT: movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
 ; X64-FAST-NEXT: movq %rdi, %rax
 ; X64-FAST-NEXT: mulq %rcx
-; X64-FAST-NEXT: subq %rdx, %rdi
-; X64-FAST-NEXT: shrq %rdi
-; X64-FAST-NEXT: leaq (%rdi,%rdx), %rax
-; X64-FAST-NEXT: shrq $2, %rax
+; X64-FAST-NEXT: movq %rdx, %rax
 ; X64-FAST-NEXT: retq
 ;
 ; X64-SLOW-LABEL: PR23590:
@@ -336,11 +333,6 @@
 ; X64-SLOW-NEXT: subq %rax, %rdi
 ; X64-SLOW-NEXT: imulq $613566757, %rdi, %rax # imm = 0x24924925
 ; X64-SLOW-NEXT: shrq $32, %rax
-; X64-SLOW-NEXT: subl %eax, %edi
-; X64-SLOW-NEXT: shrl %edi
-; X64-SLOW-NEXT: addl %eax, %edi
-; X64-SLOW-NEXT: shrl $2, %edi
-; X64-SLOW-NEXT: movq %rdi, %rax
 ; X64-SLOW-NEXT: retq
 entry:
   %rem = urem i64 %x, 12345
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -178,9 +178,8 @@
 ; X64-AVX2-NEXT: movq %rdx, %rcx
 ; X64-AVX2-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
 ; X64-AVX2-NEXT: andq %rdx, %rax
-; X64-AVX2-NEXT: movabsq $-2492803253203993461, %rdx # imm = 0xDD67C8A60DD67C8B
+; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
 ; X64-AVX2-NEXT: mulq %rdx
-; X64-AVX2-NEXT: shrq $5, %rdx
 ; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax
 ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax
 ; X64-AVX2-NEXT: subl %eax, %ecx
@@ -346,9 +345,8 @@
 ; X64-AVX2-NEXT: movq %rdx, %rcx
 ; X64-AVX2-NEXT: movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
 ; X64-AVX2-NEXT: andq %rdx, %rax
-; X64-AVX2-NEXT: movabsq $-2492803253203993461, %rdx # imm = 0xDD67C8A60DD67C8B
+; X64-AVX2-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
 ; X64-AVX2-NEXT: mulq %rdx
-; X64-AVX2-NEXT: shrq $5, %rdx
 ; X64-AVX2-NEXT: leal (%rdx,%rdx,8), %eax
 ; X64-AVX2-NEXT: leal (%rdx,%rax,4), %eax
 ; X64-AVX2-NEXT: subl %eax, %ecx
diff --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll
--- a/llvm/test/CodeGen/X86/pr38217.ll
+++ b/llvm/test/CodeGen/X86/pr38217.ll
@@ -19,8 +19,8 @@
 ; CHECK-NEXT: imulq $10000, %rdx, %rax # imm = 0x2710
 ; CHECK-NEXT: movq %rdi, %r9
 ; CHECK-NEXT: subq %rax, %r9
-; CHECK-NEXT: imulq $1374389535, %r9, %rax # imm = 0x51EB851F
-; CHECK-NEXT: shrq $37, %rax
+; CHECK-NEXT: imulq $42949673, %r9, %rax # imm = 0x28F5C29
+; CHECK-NEXT: shrq $32, %rax
 ; CHECK-NEXT: imull $100, %eax, %r10d
 ; CHECK-NEXT: subl %r10d, %r9d
 ; CHECK-NEXT: movl %ecx, %r10d
diff --git a/llvm/unittests/Support/DivisionByConstantTest.cpp b/llvm/unittests/Support/DivisionByConstantTest.cpp
--- a/llvm/unittests/Support/DivisionByConstantTest.cpp
+++ b/llvm/unittests/Support/DivisionByConstantTest.cpp
@@ -97,11 +97,22 @@
   return (X.zext(WideBits) * Y.zext(WideBits)).lshr(Bits).trunc(Bits);
 }
 
-APInt UnsignedDivideUsingMagic(APInt Numerator, APInt Divisor,
+APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
+                               bool LZOptimization,
                                bool AllowEvenDivisorOptimization, bool ForceNPQ,
                                UnsignedDivisionByConstantInfo Magics) {
   unsigned Bits = Numerator.getBitWidth();
 
+  if (LZOptimization && !Divisor.isOne()) {
+    unsigned LeadingZeros = Numerator.countLeadingZeros();
+    // Clip to the number of leading zeros in the divisor.
+    LeadingZeros = std::min(LeadingZeros, Divisor.countLeadingZeros());
+    if (LeadingZeros > 0) {
+      Magics = UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
+      assert(!Magics.IsAdd && "Should use cheap fixup now");
+    }
+  }
+
   unsigned PreShift = 0;
   if (AllowEvenDivisorOptimization) {
     // If the divisor is even, we can avoid using the expensive fixup by
@@ -159,7 +170,7 @@
   for (unsigned Bits = 1; Bits <= 32; ++Bits) {
     if (Bits < 2)
       continue; // Not supported by `UnsignedDivisionByConstantInfo::get()`.
-    if (Bits > 11)
+    if (Bits > 10)
       continue; // Unreasonably slow.
     EnumerateAPInts(Bits, [Bits](const APInt &Divisor) {
       if (Divisor.isZero())
@@ -168,17 +179,20 @@
           UnsignedDivisionByConstantInfo::get(Divisor);
     EnumerateAPInts(Bits, [Divisor, Magics, Bits](const APInt &Numerator) {
       APInt NativeResult = Numerator.udiv(Divisor);
-      for (bool AllowEvenDivisorOptimization : {true, false}) {
-        for (bool ForceNPQ : {false, true}) {
-          APInt MagicResult = UnsignedDivideUsingMagic(
-              Numerator, Divisor, AllowEvenDivisorOptimization, ForceNPQ,
-              Magics);
-          ASSERT_EQ(MagicResult, NativeResult)
-              << " ... given the operation: urem i" << Bits << " "
-              << Numerator << ", " << Divisor
-              << " (allow even divisior optimization = "
-              << AllowEvenDivisorOptimization << ", force NPQ = " << ForceNPQ
-              << ")";
+      for (bool LZOptimization : {true, false}) {
+        for (bool AllowEvenDivisorOptimization : {true, false}) {
+          for (bool ForceNPQ : {false, true}) {
+            APInt MagicResult = UnsignedDivideUsingMagic(
+                Numerator, Divisor, LZOptimization,
+                AllowEvenDivisorOptimization, ForceNPQ, Magics);
+            ASSERT_EQ(MagicResult, NativeResult)
+                << " ... given the operation: urem i" << Bits << " "
+                << Numerator << ", " << Divisor
+                << " (allow LZ optimization = "
+                << LZOptimization << ", allow even divisor optimization = "
+                << AllowEvenDivisorOptimization << ", force NPQ = "
+                << ForceNPQ << ")";
+          }
         }
       }
     });
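
Reviewer note, not part of the patch: a quick way to sanity-check the deleted post-shift/fixup CHECK lines is the d = 95 case from the RV32IM checks above. `lui a5, 11038; addi a5, a5, -1465` materializes 11038 * 4096 - 1465 = 45210183 = ceil(2^32 / 95). The dividends there are `lhu` loads, so they carry at least 16 leading zeros, and on that narrowed range the rounded-up magic is exact with a single mulhu and no pre-shift, post-shift, or NPQ fixup. A minimal standalone C++ sketch of the arithmetic (the names and the exhaustive loop are illustrative, not from the tree):

// Hypothetical sanity check, independent of the LLVM APIs in this patch.
// M = ceil(2^32 / 95) = 45210183 satisfies M * 95 = 2^32 + 89, so for
// n < 2^16 the error term n * 89 / (95 * 2^32) stays below 1/95 and the
// truncating multiply-high never overshoots: n / 95 == (n * M) >> 32.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t D = 95;
  const uint64_t M = ((uint64_t)1 << 32) / D + 1; // ceil; 95 never divides 2^32
  assert(M == 45210183);                          // == 11038 * 4096 - 1465
  for (uint64_t N = 0; N < ((uint64_t)1 << 16); ++N)
    assert(N / D == (N * M) >> 32); // mulhu-only expansion, no post-shift
  return 0;
}

This headroom is exactly what the clamped LeadingZeros argument hands to UnsignedDivisionByConstantInfo::get: the multiplier's rounding error never reaches the next integer on the narrowed dividend range, so the IsAdd fixup path and the trailing shift both disappear from the generated code.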