diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3939,6 +3939,41 @@
   return SDValue();
 }
 
+// Rewrite a multiply by a sign-splat as a masked negate:
+//   (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
+// An arithmetic shift right by BW-1 yields 0 or -1, so the product is 0 or -Y,
+// which is exactly the negated AND of the two operands.
+//
+// N is the MUL node (it supplies the result type and the SDLoc), N0 is the
+// operand tested for the shift pattern and N1 is the other operand. Returns
+// the replacement value, or an empty SDValue if N0 does not match or the
+// function is marked minsize.
+static SDValue foldSraMulToAndNeg(SDNode *N, SDValue N0, SDValue N1,
+                                  SelectionDAG &DAG) {
+  if (N0.getOpcode() != ISD::SRA)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  const unsigned SignBitIdx = VT.getScalarSizeInBits() - 1;
+
+  // Only a constant (or constant-splat) shift amount of BW-1 qualifies.
+  // TODO: Use computeNumSignBits() == BitWidth?
+  ConstantSDNode *ShAmtC = isConstOrConstSplat(N0.getOperand(1));
+  const bool IsSignSplat = ShAmtC && ShAmtC->getAPIntValue() == SignBitIdx;
+  if (!IsSignSplat)
+    return SDValue();
+
+  // AND + NEG is one more instruction than the MUL it replaces, so keep the
+  // multiply when optimizing for minimum size.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue Masked = DAG.getNode(ISD::AND, DL, VT, N0, N1);
+  SDValue Zero = DAG.getConstant(0, DL, VT);
+  return DAG.getNode(ISD::SUB, DL, VT, Zero, Masked);
+}
+
 SDValue DAGCombiner::visitMUL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4149,6 +4184,11 @@
     }
   }
 
+  if (SDValue V = foldSraMulToAndNeg(N, N0, N1, DAG))
+    return V;
+  if (SDValue V = foldSraMulToAndNeg(N, N1, N0, DAG))
+    return V;
+
   // reassociate mul
   if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
     return RMUL;
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -39,21 +39,24 @@
 ; AARCH: // %bb.0: // %Entry
 ; AARCH-NEXT: asr x9, x1, #63
 ; AARCH-NEXT: asr x10, x3, #63
+; AARCH-NEXT: and x11, x9, x2
+; AARCH-NEXT: and x14, x10, x1
+; AARCH-NEXT: umulh x12, x2, x9
+; AARCH-NEXT: and x9, x9, x3
+; AARCH-NEXT: umulh x13, x10, x0
+; AARCH-NEXT:
and x10, x10, x0 +; AARCH-NEXT: sub x12, x12, x11 +; AARCH-NEXT: neg x11, x11 +; AARCH-NEXT: sub x13, x13, x14 +; AARCH-NEXT: sub x9, x12, x9 +; AARCH-NEXT: sub x12, x13, x10 +; AARCH-NEXT: neg x10, x10 ; AARCH-NEXT: umulh x14, x0, x2 -; AARCH-NEXT: mov x8, x1 -; AARCH-NEXT: mul x11, x2, x9 -; AARCH-NEXT: str wzr, [x4] -; AARCH-NEXT: umulh x12, x10, x0 -; AARCH-NEXT: umulh x13, x2, x9 -; AARCH-NEXT: madd x12, x10, x1, x12 -; AARCH-NEXT: add x13, x13, x11 -; AARCH-NEXT: mul x10, x10, x0 -; AARCH-NEXT: madd x9, x3, x9, x13 -; AARCH-NEXT: add x12, x12, x10 ; AARCH-NEXT: adds x10, x10, x11 ; AARCH-NEXT: mul x11, x1, x2 ; AARCH-NEXT: adc x9, x12, x9 ; AARCH-NEXT: umulh x13, x1, x2 +; AARCH-NEXT: mov x8, x1 ; AARCH-NEXT: mul x12, x0, x3 ; AARCH-NEXT: adds x11, x11, x14 ; AARCH-NEXT: umulh x14, x0, x3 @@ -73,6 +76,7 @@ ; AARCH-NEXT: eor x9, x9, x11 ; AARCH-NEXT: eor x10, x10, x11 ; AARCH-NEXT: orr x9, x10, x9 +; AARCH-NEXT: str wzr, [x4] ; AARCH-NEXT: cmp x9, #0 ; AARCH-NEXT: cset w9, ne ; AARCH-NEXT: tbz x8, #63, .LBB1_2 diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -159,24 +159,28 @@ ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0 -; CI-NEXT: v_ashrrev_i32_e32 v13, 31, v0 +; CI-NEXT: v_ashrrev_i32_e32 v11, 31, v0 ; CI-NEXT: v_mov_b32_e32 v8, 0 -; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8] -; CI-NEXT: v_ashrrev_i32_e32 v14, 31, v1 -; CI-NEXT: v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0 -; CI-NEXT: v_mov_b32_e32 v7, v10 +; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v1, v[7:8] +; CI-NEXT: v_ashrrev_i32_e32 v12, 31, v1 +; CI-NEXT: v_and_b32_e32 v14, v11, v1 +; CI-NEXT: v_mov_b32_e32 v1, v10 ; CI-NEXT: v_mov_b32_e32 v10, v8 -; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10] -; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12] -; CI-NEXT: 
v_add_i32_e32 v9, vcc, v7, v9 -; CI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc -; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10] -; CI-NEXT: v_add_i32_e32 v7, vcc, v9, v0 -; CI-NEXT: v_addc_u32_e32 v9, vcc, v10, v1, vcc -; CI-NEXT: v_mov_b32_e32 v1, v8 +; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v12, v[9:10] +; CI-NEXT: v_and_b32_e32 v13, v11, v12 +; CI-NEXT: v_sub_i32_e32 v9, vcc, 0, v14 +; CI-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc +; CI-NEXT: v_mad_i64_i32 v[9:10], s[4:5], v12, v0, v[9:10] +; CI-NEXT: v_mov_b32_e32 v0, v8 +; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v12, v[0:1] +; CI-NEXT: v_add_i32_e32 v8, vcc, v0, v9 +; CI-NEXT: v_addc_u32_e32 v9, vcc, v1, v10, vcc +; CI-NEXT: v_mov_b32_e32 v1, v7 ; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc +; CI-NEXT: v_addc_u32_e32 v2, vcc, v8, v4, vcc ; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -184,60 +188,64 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v0 -; SI-NEXT: v_mul_lo_u32 v11, v6, v1 -; SI-NEXT: v_mul_hi_u32 v12, v0, v1 ; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; SI-NEXT: v_mul_hi_u32 v14, v6, v1 -; SI-NEXT: v_mul_lo_u32 v13, v0, v7 -; SI-NEXT: v_mul_hi_u32 v10, v0, v7 -; SI-NEXT: v_add_i32_e32 v12, vcc, v11, v12 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_mul_hi_u32 v8, v6, v7 -; SI-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_mul_i32_i24_e32 v9, v6, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; SI-NEXT: v_mul_hi_i32 v6, v1, v6 -; SI-NEXT: v_mul_hi_i32 v7, v7, v0 -; SI-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, 
v13, v11 +; SI-NEXT: v_and_b32_e32 v9, v7, v0 +; SI-NEXT: v_and_b32_e32 v10, v6, v1 +; SI-NEXT: v_mul_lo_u32 v13, v6, v1 +; SI-NEXT: v_mul_hi_u32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v8, v7, v6 +; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; SI-NEXT: v_mul_hi_u32 v10, v6, v7 +; SI-NEXT: v_mul_i32_i24_e32 v11, v6, v7 +; SI-NEXT: v_mul_hi_u32 v6, v6, v1 +; SI-NEXT: v_mul_hi_u32 v12, v0, v7 +; SI-NEXT: v_mul_lo_u32 v7, v0, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v8, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; SI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1 -; SI-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, v9, v10 -; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc +; SI-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc +; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 +; SI-NEXT: v_subb_u32_e32 v8, vcc, v10, v8, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; SI-NEXT: v_addc_u32_e32 v1, vcc, v12, v3, vcc -; SI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc -; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; SI-NEXT: v_addc_u32_e32 v1, vcc, v7, v3, vcc +; SI-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; SI-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: mad_i64_i32_sextops_i32_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9] -; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v11 -; GFX9-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11] -; 
GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9] -; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0 -; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13] -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v1, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v1 +; GFX9-NEXT: v_and_b32_e32 v6, v14, v1 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-NEXT: v_and_b32_e32 v7, v14, v15 +; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v6 +; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v1, v[10:11] +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-NEXT: v_mad_i64_i32 v[6:7], s[4:5], v15, v0, v[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v15, v[12:13] +; GFX9-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v15, v[10:11] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: mad_i64_i32_sextops_i32_i128: @@ -246,27 +254,30 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0 
; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v16, 31, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v17, 31, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v16, v1, v[7:8] ; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10] -; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0 -; GFX11-NEXT: v_mov_b32_e32 v8, v12 +; GFX11-NEXT: v_and_b32_e32 v8, v16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v17, v[9:10] +; GFX11-NEXT: v_and_b32_e32 v9, v16, v17 +; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, 0, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v1, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10] -; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8] +; GFX11-NEXT: v_mad_i64_i32 v[14:15], null, v17, v0, v[8:9] +; GFX11-NEXT: v_add_co_u32 v12, s0, v7, v1 ; GFX11-NEXT: v_mov_b32_e32 v7, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12 -; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, 0, s0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v17, v[12:13] +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v15, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll --- a/llvm/test/CodeGen/PowerPC/pr45448.ll +++ b/llvm/test/CodeGen/PowerPC/pr45448.ll @@ -25,7 +25,8 @@ ; CHECK-NEXT: rldic r5, r5, 4, 32 ; CHECK-NEXT: crnot 4*cr5+lt, eq ; CHECK-NEXT: mulhdu r3, r3, r5 -; CHECK-NEXT: maddld r6, r4, r5, r3 +; CHECK-NEXT: and r6, r4, r5 +; CHECK-NEXT: sub r6, r3, r6 ; CHECK-NEXT: cmpld cr1, r6, r3 ; CHECK-NEXT: mulhdu. 
r3, r4, r5 ; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_10 diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1480,18 +1480,18 @@ ; RV32IM-NEXT: add a5, a6, a2 ; RV32IM-NEXT: mul a7, a1, a3 ; RV32IM-NEXT: add t0, a7, a5 -; RV32IM-NEXT: mul t1, a4, a0 -; RV32IM-NEXT: add a2, t0, t1 +; RV32IM-NEXT: and t1, a4, a0 +; RV32IM-NEXT: sub a2, t0, t1 ; RV32IM-NEXT: sltu t2, a2, t0 ; RV32IM-NEXT: sltu a7, t0, a7 ; RV32IM-NEXT: sltu a5, a5, a6 ; RV32IM-NEXT: mulhu a3, a1, a3 ; RV32IM-NEXT: add a3, a3, a5 ; RV32IM-NEXT: add a3, a3, a7 -; RV32IM-NEXT: mul a1, a4, a1 +; RV32IM-NEXT: and a1, a4, a1 ; RV32IM-NEXT: mulhu a0, a4, a0 -; RV32IM-NEXT: add a0, a0, a1 -; RV32IM-NEXT: add a0, a0, t1 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: sub a0, a0, t1 ; RV32IM-NEXT: add a0, a3, a0 ; RV32IM-NEXT: add a1, a0, t2 ; RV32IM-NEXT: mv a0, a2 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -961,8 +961,10 @@ ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: .cfi_offset s1, -8 +; RV32-NEXT: .cfi_offset s2, -12 ; RV32-NEXT: mulhu a5, a0, a2 ; RV32-NEXT: mul a6, a1, a2 ; RV32-NEXT: add a5, a6, a5 @@ -978,33 +980,34 @@ ; RV32-NEXT: mul t0, a1, a3 ; RV32-NEXT: add t1, t0, a7 ; RV32-NEXT: srai t2, a1, 31 -; RV32-NEXT: mul t3, a2, t2 +; RV32-NEXT: and t3, t2, a2 ; RV32-NEXT: srai t4, a3, 31 -; RV32-NEXT: mul t5, t4, a0 -; RV32-NEXT: add t6, t5, t3 -; RV32-NEXT: add s0, t1, t6 -; RV32-NEXT: sltu s1, s0, t1 +; RV32-NEXT: and t5, t4, a0 +; RV32-NEXT: neg t6, t5 +; RV32-NEXT: sub s0, t6, t3 +; RV32-NEXT: add s1, t1, s0 +; RV32-NEXT: sltu s2, s1, t1 ; RV32-NEXT: sltu t0, t1, t0 ; RV32-NEXT: sltu a6, a7, a6 ; RV32-NEXT: 
mulhu a7, a1, a3 ; RV32-NEXT: add a6, a7, a6 ; RV32-NEXT: add a6, a6, t0 ; RV32-NEXT: mulhu a7, a2, t2 -; RV32-NEXT: add a7, a7, t3 -; RV32-NEXT: mul a3, a3, t2 -; RV32-NEXT: add a3, a7, a3 -; RV32-NEXT: mul a1, t4, a1 +; RV32-NEXT: sub a7, a7, t3 +; RV32-NEXT: and a3, t2, a3 +; RV32-NEXT: sub a3, a7, a3 +; RV32-NEXT: and a1, t4, a1 ; RV32-NEXT: mulhu a7, t4, a0 -; RV32-NEXT: add a1, a7, a1 -; RV32-NEXT: add a1, a1, t5 +; RV32-NEXT: sub a1, a7, a1 +; RV32-NEXT: sub a1, a1, t5 ; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: sltu a3, t6, t5 +; RV32-NEXT: sltu a3, s0, t6 ; RV32-NEXT: add a1, a1, a3 ; RV32-NEXT: add a1, a6, a1 -; RV32-NEXT: add a1, a1, s1 +; RV32-NEXT: add a1, a1, s2 ; RV32-NEXT: srai a3, a5, 31 ; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: xor a3, s0, a3 +; RV32-NEXT: xor a3, s1, a3 ; RV32-NEXT: or a1, a3, a1 ; RV32-NEXT: snez a1, a1 ; RV32-NEXT: mul a0, a0, a2 @@ -1013,6 +1016,7 @@ ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -1032,8 +1036,10 @@ ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s2, 4(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 ; RV32ZBA-NEXT: .cfi_offset s1, -8 +; RV32ZBA-NEXT: .cfi_offset s2, -12 ; RV32ZBA-NEXT: mulhu a5, a0, a2 ; RV32ZBA-NEXT: mul a6, a1, a2 ; RV32ZBA-NEXT: add a5, a6, a5 @@ -1049,33 +1055,34 @@ ; RV32ZBA-NEXT: mul t0, a1, a3 ; RV32ZBA-NEXT: add t1, t0, a7 ; RV32ZBA-NEXT: srai t2, a1, 31 -; RV32ZBA-NEXT: mul t3, a2, t2 +; RV32ZBA-NEXT: and t3, t2, a2 ; RV32ZBA-NEXT: srai t4, a3, 31 -; RV32ZBA-NEXT: mul t5, t4, a0 -; RV32ZBA-NEXT: add t6, t5, t3 -; RV32ZBA-NEXT: add s0, t1, t6 -; RV32ZBA-NEXT: sltu s1, s0, t1 +; RV32ZBA-NEXT: and t5, t4, a0 +; RV32ZBA-NEXT: neg t6, t5 +; RV32ZBA-NEXT: sub s0, t6, t3 +; RV32ZBA-NEXT: add s1, 
t1, s0 +; RV32ZBA-NEXT: sltu s2, s1, t1 ; RV32ZBA-NEXT: sltu t0, t1, t0 ; RV32ZBA-NEXT: sltu a6, a7, a6 ; RV32ZBA-NEXT: mulhu a7, a1, a3 ; RV32ZBA-NEXT: add a6, a7, a6 ; RV32ZBA-NEXT: add a6, a6, t0 ; RV32ZBA-NEXT: mulhu a7, a2, t2 -; RV32ZBA-NEXT: add a7, a7, t3 -; RV32ZBA-NEXT: mul a3, a3, t2 -; RV32ZBA-NEXT: add a3, a7, a3 -; RV32ZBA-NEXT: mul a1, t4, a1 +; RV32ZBA-NEXT: sub a7, a7, t3 +; RV32ZBA-NEXT: and a3, t2, a3 +; RV32ZBA-NEXT: sub a3, a7, a3 +; RV32ZBA-NEXT: and a1, t4, a1 ; RV32ZBA-NEXT: mulhu a7, t4, a0 -; RV32ZBA-NEXT: add a1, a7, a1 -; RV32ZBA-NEXT: add a1, a1, t5 +; RV32ZBA-NEXT: sub a1, a7, a1 +; RV32ZBA-NEXT: sub a1, a1, t5 ; RV32ZBA-NEXT: add a1, a1, a3 -; RV32ZBA-NEXT: sltu a3, t6, t5 +; RV32ZBA-NEXT: sltu a3, s0, t6 ; RV32ZBA-NEXT: add a1, a1, a3 ; RV32ZBA-NEXT: add a1, a6, a1 -; RV32ZBA-NEXT: add a1, a1, s1 +; RV32ZBA-NEXT: add a1, a1, s2 ; RV32ZBA-NEXT: srai a3, a5, 31 ; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: xor a3, s0, a3 +; RV32ZBA-NEXT: xor a3, s1, a3 ; RV32ZBA-NEXT: or a1, a3, a1 ; RV32ZBA-NEXT: snez a1, a1 ; RV32ZBA-NEXT: mul a0, a0, a2 @@ -1084,6 +1091,7 @@ ; RV32ZBA-NEXT: mv a0, a1 ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s2, 4(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -1115,8 +1123,8 @@ ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: srai a1, a1, 31 -; RV32-NEXT: mul a6, a1, a3 -; RV32-NEXT: add a6, a5, a6 +; RV32-NEXT: andi a6, a1, 13 +; RV32-NEXT: sub a6, a5, a6 ; RV32-NEXT: srai a7, a4, 31 ; RV32-NEXT: xor t0, a6, a7 ; RV32-NEXT: sltu a5, a6, a5 @@ -1152,8 +1160,8 @@ ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: srai a1, a1, 31 -; RV32ZBA-NEXT: mul a6, a1, a3 -; RV32ZBA-NEXT: add a6, a5, a6 +; RV32ZBA-NEXT: andi a6, a1, 13 +; RV32ZBA-NEXT: sub a6, a5, a6 ; RV32ZBA-NEXT: srai a7, a4, 31 ; RV32ZBA-NEXT: xor t0, a6, a7 ; RV32ZBA-NEXT: sltu a5, a6, 
a5 @@ -2352,7 +2360,9 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -2368,33 +2378,34 @@ ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 +; RV32-NEXT: and t2, t1, a2 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 +; RV32-NEXT: and t4, t3, a0 +; RV32-NEXT: neg t5, t4 +; RV32-NEXT: sub t6, t5, t2 +; RV32-NEXT: add s0, t0, t6 +; RV32-NEXT: sltu s1, s0, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a6, a2, t1 -; RV32-NEXT: add a6, a6, t2 -; RV32-NEXT: mul a7, a3, t1 -; RV32-NEXT: add a6, a6, a7 -; RV32-NEXT: mul a7, t3, a1 +; RV32-NEXT: sub a6, a6, t2 +; RV32-NEXT: and a7, t1, a3 +; RV32-NEXT: sub a6, a6, a7 +; RV32-NEXT: and a7, t3, a1 ; RV32-NEXT: mulhu t0, t3, a0 -; RV32-NEXT: add a7, t0, a7 -; RV32-NEXT: add a7, a7, t4 +; RV32-NEXT: sub a7, t0, a7 +; RV32-NEXT: sub a7, a7, t4 ; RV32-NEXT: add a6, a7, a6 -; RV32-NEXT: sltu a7, t5, t4 +; RV32-NEXT: sltu a7, t6, t5 ; RV32-NEXT: add a6, a6, a7 ; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: add a5, a5, s0 +; RV32-NEXT: add a5, a5, s1 ; RV32-NEXT: srai a4, a4, 31 ; RV32-NEXT: xor a5, a5, a4 -; RV32-NEXT: xor a4, t6, a4 +; RV32-NEXT: xor a4, s0, a4 ; RV32-NEXT: or a4, a4, a5 ; RV32-NEXT: bnez a4, .LBB46_2 ; RV32-NEXT: # %bb.1: # %entry @@ -2402,6 +2413,7 @@ ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: .LBB46_2: # %entry ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -2421,7 +2433,9 @@ 
; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 +; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -2437,33 +2451,34 @@ ; RV32ZBA-NEXT: mul a7, a1, a3 ; RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 +; RV32ZBA-NEXT: and t2, t1, a2 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 +; RV32ZBA-NEXT: and t4, t3, a0 +; RV32ZBA-NEXT: neg t5, t4 +; RV32ZBA-NEXT: sub t6, t5, t2 +; RV32ZBA-NEXT: add s0, t0, t6 +; RV32ZBA-NEXT: sltu s1, s0, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a6, a2, t1 -; RV32ZBA-NEXT: add a6, a6, t2 -; RV32ZBA-NEXT: mul a7, a3, t1 -; RV32ZBA-NEXT: add a6, a6, a7 -; RV32ZBA-NEXT: mul a7, t3, a1 +; RV32ZBA-NEXT: sub a6, a6, t2 +; RV32ZBA-NEXT: and a7, t1, a3 +; RV32ZBA-NEXT: sub a6, a6, a7 +; RV32ZBA-NEXT: and a7, t3, a1 ; RV32ZBA-NEXT: mulhu t0, t3, a0 -; RV32ZBA-NEXT: add a7, t0, a7 -; RV32ZBA-NEXT: add a7, a7, t4 +; RV32ZBA-NEXT: sub a7, t0, a7 +; RV32ZBA-NEXT: sub a7, a7, t4 ; RV32ZBA-NEXT: add a6, a7, a6 -; RV32ZBA-NEXT: sltu a7, t5, t4 +; RV32ZBA-NEXT: sltu a7, t6, t5 ; RV32ZBA-NEXT: add a6, a6, a7 ; RV32ZBA-NEXT: add a5, a5, a6 -; RV32ZBA-NEXT: add a5, a5, s0 +; RV32ZBA-NEXT: add a5, a5, s1 ; RV32ZBA-NEXT: srai a4, a4, 31 ; RV32ZBA-NEXT: xor a5, a5, a4 -; RV32ZBA-NEXT: xor a4, t6, a4 +; RV32ZBA-NEXT: xor a4, s0, a4 ; RV32ZBA-NEXT: or a4, a4, a5 ; RV32ZBA-NEXT: bnez a4, .LBB46_2 ; RV32ZBA-NEXT: # %bb.1: # %entry @@ -2471,6 +2486,7 @@ ; RV32ZBA-NEXT: mv a1, a3 ; RV32ZBA-NEXT: .LBB46_2: # %entry ; RV32ZBA-NEXT: 
lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -2497,7 +2513,9 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -2513,36 +2531,38 @@ ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 +; RV32-NEXT: and t2, t1, a2 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 +; RV32-NEXT: and t4, t3, a0 +; RV32-NEXT: neg t5, t4 +; RV32-NEXT: sub t6, t5, t2 +; RV32-NEXT: add s0, t0, t6 +; RV32-NEXT: sltu s1, s0, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a2, a2, t1 -; RV32-NEXT: add a2, a2, t2 -; RV32-NEXT: mul a3, a3, t1 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: mul a1, t3, a1 +; RV32-NEXT: sub a2, a2, t2 +; RV32-NEXT: and a3, t1, a3 +; RV32-NEXT: sub a2, a2, a3 +; RV32-NEXT: and a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, t4 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: sub a0, a0, t4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: sltu a1, t5, t4 +; RV32-NEXT: sltu a1, t6, t5 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a0, a0, s0 +; RV32-NEXT: add a0, a0, s1 ; RV32-NEXT: srai a1, a4, 31 ; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: xor a1, t6, a1 +; RV32-NEXT: xor a1, s0, a1 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: 
ret ; @@ -2560,7 +2580,9 @@ ; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 +; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -2576,36 +2598,38 @@ ; RV32ZBA-NEXT: mul a7, a1, a3 ; RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 +; RV32ZBA-NEXT: and t2, t1, a2 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 +; RV32ZBA-NEXT: and t4, t3, a0 +; RV32ZBA-NEXT: neg t5, t4 +; RV32ZBA-NEXT: sub t6, t5, t2 +; RV32ZBA-NEXT: add s0, t0, t6 +; RV32ZBA-NEXT: sltu s1, s0, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a2, a2, t1 -; RV32ZBA-NEXT: add a2, a2, t2 -; RV32ZBA-NEXT: mul a3, a3, t1 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: mul a1, t3, a1 +; RV32ZBA-NEXT: sub a2, a2, t2 +; RV32ZBA-NEXT: and a3, t1, a3 +; RV32ZBA-NEXT: sub a2, a2, a3 +; RV32ZBA-NEXT: and a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a0, a0, t4 +; RV32ZBA-NEXT: sub a0, a0, a1 +; RV32ZBA-NEXT: sub a0, a0, t4 ; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: sltu a1, t5, t4 +; RV32ZBA-NEXT: sltu a1, t6, t5 ; RV32ZBA-NEXT: add a0, a0, a1 ; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: add a0, a0, s0 +; RV32ZBA-NEXT: add a0, a0, s1 ; RV32ZBA-NEXT: srai a1, a4, 31 ; RV32ZBA-NEXT: xor a0, a0, a1 -; RV32ZBA-NEXT: xor a1, t6, a1 +; RV32ZBA-NEXT: xor a1, s0, a1 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: seqz a0, a0 ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; 
RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -3453,7 +3477,9 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -3469,33 +3495,34 @@ ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 +; RV32-NEXT: and t2, t1, a2 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 +; RV32-NEXT: and t4, t3, a0 +; RV32-NEXT: neg t5, t4 +; RV32-NEXT: sub t6, t5, t2 +; RV32-NEXT: add s0, t0, t6 +; RV32-NEXT: sltu s1, s0, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a2, a2, t1 -; RV32-NEXT: add a2, a2, t2 -; RV32-NEXT: mul a3, a3, t1 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: mul a1, t3, a1 +; RV32-NEXT: sub a2, a2, t2 +; RV32-NEXT: and a3, t1, a3 +; RV32-NEXT: sub a2, a2, a3 +; RV32-NEXT: and a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, t4 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: sub a0, a0, t4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: sltu a1, t5, t4 +; RV32-NEXT: sltu a1, t6, t5 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a0, a0, s0 +; RV32-NEXT: add a0, a0, s1 ; RV32-NEXT: srai a1, a4, 31 ; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: xor a1, t6, a1 +; RV32-NEXT: xor a1, s0, a1 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: beqz a0, .LBB61_2 ; RV32-NEXT: # %bb.1: # %overflow @@ -3505,6 +3532,7 @@ ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB61_3: # %overflow ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: 
addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -3526,7 +3554,9 @@ ; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 +; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -3542,33 +3572,34 @@ ; RV32ZBA-NEXT: mul a7, a1, a3 ; RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 +; RV32ZBA-NEXT: and t2, t1, a2 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 +; RV32ZBA-NEXT: and t4, t3, a0 +; RV32ZBA-NEXT: neg t5, t4 +; RV32ZBA-NEXT: sub t6, t5, t2 +; RV32ZBA-NEXT: add s0, t0, t6 +; RV32ZBA-NEXT: sltu s1, s0, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a2, a2, t1 -; RV32ZBA-NEXT: add a2, a2, t2 -; RV32ZBA-NEXT: mul a3, a3, t1 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: mul a1, t3, a1 +; RV32ZBA-NEXT: sub a2, a2, t2 +; RV32ZBA-NEXT: and a3, t1, a3 +; RV32ZBA-NEXT: sub a2, a2, a3 +; RV32ZBA-NEXT: and a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a0, a0, t4 +; RV32ZBA-NEXT: sub a0, a0, a1 +; RV32ZBA-NEXT: sub a0, a0, t4 ; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: sltu a1, t5, t4 +; RV32ZBA-NEXT: sltu a1, t6, t5 ; RV32ZBA-NEXT: add a0, a0, a1 ; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: add a0, a0, s0 +; RV32ZBA-NEXT: add a0, a0, s1 ; RV32ZBA-NEXT: srai a1, a4, 31 ; RV32ZBA-NEXT: xor a0, a0, a1 -; RV32ZBA-NEXT: xor a1, t6, a1 +; RV32ZBA-NEXT: xor a1, s0, a1 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: beqz a0, .LBB61_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow @@ -3578,6 +3609,7 @@ ; RV32ZBA-NEXT: li 
a0, 1 ; RV32ZBA-NEXT: .LBB61_3: # %overflow ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -3625,8 +3657,8 @@ ; RV32-NEXT: add a6, a4, a6 ; RV32-NEXT: sub t1, a6, a1 ; RV32-NEXT: srai t2, a1, 31 -; RV32-NEXT: mul t3, t2, a2 -; RV32-NEXT: sub t3, t3, a0 +; RV32-NEXT: andi t3, t2, -13 +; RV32-NEXT: sub t3, a5, t3 ; RV32-NEXT: add t4, t1, t3 ; RV32-NEXT: sltu t5, t4, t1 ; RV32-NEXT: neg t6, a1 @@ -3687,8 +3719,8 @@ ; RV32ZBA-NEXT: add a6, a4, a6 ; RV32ZBA-NEXT: sub t1, a6, a1 ; RV32ZBA-NEXT: srai t2, a1, 31 -; RV32ZBA-NEXT: mul t3, t2, a2 -; RV32ZBA-NEXT: sub t3, t3, a0 +; RV32ZBA-NEXT: andi t3, t2, -13 +; RV32ZBA-NEXT: sub t3, a5, t3 ; RV32ZBA-NEXT: add t4, t1, t3 ; RV32ZBA-NEXT: sltu t5, t4, t1 ; RV32ZBA-NEXT: neg t6, a1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -38,22 +38,23 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_0246_ext0(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0246_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r2, r5, r3, r0 +; CHECK-NEXT: umull r2, r4, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: mla r4, r1, r2, r12 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r2, r3, r2, r5 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: mla r1, r1, r0, r4 -; CHECK-NEXT: mla r0, r3, r0, r2 +; CHECK-NEXT: and.w r2, r1, r0, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r0, r1, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r3, r0, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; 
CHECK-NEXT: and.w r0, r0, r3, asr #31 +; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -67,22 +68,23 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_0246(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_0246: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: asrs r4, r0, #31 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull lr, r12, r0, r1 -; CHECK-NEXT: umull r2, r5, r0, r3 +; CHECK-NEXT: umull r2, r4, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: mla r2, r0, r2, r12 -; CHECK-NEXT: mla r1, r4, r1, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r0, r0, r2, r5 -; CHECK-NEXT: mla r0, r4, r3, r0 +; CHECK-NEXT: and.w r2, r0, r1, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r1, r0, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r0, r3, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r0, r3, r0, asr #31 +; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -130,23 +132,24 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_1357_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vrev64.32 q1, q0 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r2, r5, r3, r0 +; CHECK-NEXT: 
umull r2, r4, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: mla r4, r1, r2, r12 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r2, r3, r2, r5 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: mla r1, r1, r0, r4 -; CHECK-NEXT: mla r0, r3, r0, r2 +; CHECK-NEXT: and.w r2, r1, r0, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r0, r1, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r3, r0, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r0, r0, r3, asr #31 +; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -160,23 +163,24 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_1357: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vrev64.32 q1, q0 -; CHECK-NEXT: asrs r4, r0, #31 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r0, r1 -; CHECK-NEXT: umull r2, r5, r0, r3 +; CHECK-NEXT: umull r2, r4, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: mla r2, r0, r2, r12 -; CHECK-NEXT: mla r1, r4, r1, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r0, r0, r2, r5 -; CHECK-NEXT: mla r0, r4, r3, r0 +; CHECK-NEXT: and.w r2, r0, r1, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r1, r0, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r0, r3, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r0, r3, r0, asr #31 +; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <4 x i32> 
%src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -230,36 +234,39 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0213_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: umull r2, r5, r3, r0 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r1, r0 +; CHECK-NEXT: umull r2, r4, r3, r0 ; CHECK-NEXT: vmov q1[2], q1[0], r2, lr -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: mla r4, r1, r2, r12 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r5, r3, r2, r5 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: mla r1, r1, r0, r4 -; CHECK-NEXT: mla r3, r3, r0, r5 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r1 +; CHECK-NEXT: and.w r2, r1, r0, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r0, r1, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r3, r0, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r3, r0, r3, asr #31 +; CHECK-NEXT: subs r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: umull r3, r5, r1, r0 -; CHECK-NEXT: mla r5, r1, r2, r5 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r12, r1, r0, r5 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: umull r4, r1, r5, r0 -; CHECK-NEXT: mla r1, r5, r2, r1 -; CHECK-NEXT: asrs r2, r5, #31 +; CHECK-NEXT: and.w r2, r1, r0, asr #31 +; CHECK-NEXT: umull r3, r4, r1, r0 +; CHECK-NEXT: and.w r1, r0, r1, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sub.w r12, r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: umull r4, r1, r2, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: mla r0, r2, r0, r1 +; CHECK-NEXT: and.w r3, r2, r0, asr #31 +; CHECK-NEXT: and.w r0, r0, r2, asr #31 +; 
CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = sext <4 x i32> %shuf1 to <4 x i64> @@ -273,36 +280,39 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: asrs r4, r0, #31 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: umull r2, r5, r0, r3 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r0, r1 +; CHECK-NEXT: umull r2, r4, r0, r3 ; CHECK-NEXT: vmov q1[2], q1[0], r2, lr -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: mla r2, r0, r2, r12 -; CHECK-NEXT: mla r1, r4, r1, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r2, r0, r2, r5 -; CHECK-NEXT: mla r2, r4, r3, r2 +; CHECK-NEXT: and.w r2, r0, r1, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r1, r0, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r0, r3, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r3, r3, r0, asr #31 +; CHECK-NEXT: subs r2, r2, r3 ; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: umull r2, r3, r0, r1 -; CHECK-NEXT: asrs r5, r1, #31 -; CHECK-NEXT: mla r3, r0, r5, r3 -; CHECK-NEXT: mla r12, r4, r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r5, r1, r0, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r0, r0, r2, r1 -; CHECK-NEXT: mla r0, r4, r3, r0 +; CHECK-NEXT: umull r3, r4, r0, r1 +; CHECK-NEXT: and.w r2, r0, r1, asr #31 +; CHECK-NEXT: and.w r1, r1, r0, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sub.w 
r12, r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: umull r4, r1, r0, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: and.w r3, r0, r2, asr #31 +; CHECK-NEXT: and.w r0, r2, r0, asr #31 +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = sext <4 x i32> %shuf1 to <4 x i64> diff --git a/llvm/test/CodeGen/X86/extmul128.ll b/llvm/test/CodeGen/X86/extmul128.ll --- a/llvm/test/CodeGen/X86/extmul128.ll +++ b/llvm/test/CodeGen/X86/extmul128.ll @@ -29,8 +29,8 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %rsi ; CHECK-NEXT: sarq $63, %rsi -; CHECK-NEXT: imulq %rdi, %rsi -; CHECK-NEXT: addq %rsi, %rdx +; CHECK-NEXT: andq %rdi, %rsi +; CHECK-NEXT: subq %rsi, %rdx ; CHECK-NEXT: retq %aa = zext i64 %a to i128 %bb = sext i64 %b to i128 @@ -45,6 +45,37 @@ ; CHECK-NEXT: movq %rdi, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: mulq %rsi +; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: subq %rcx, %rdx +; CHECK-NEXT: retq + %aa = sext i64 %a to i128 + %bb = zext i64 %b to i128 + %cc = mul i128 %aa, %bb + ret i128 %cc +} + +define i128 @i64_zext_sext_i128_minsize(i64 %a, i64 %b) minsize { +; CHECK-LABEL: i64_zext_sext_i128_minsize: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: mulq %rsi +; CHECK-NEXT: sarq $63, %rsi +; CHECK-NEXT: imulq %rdi, %rsi +; CHECK-NEXT: addq %rsi, %rdx +; CHECK-NEXT: retq + %aa = zext i64 %a to i128 + %bb = sext i64 %b to i128 + %cc = mul i128 %aa, %bb + ret i128 %cc +} + +define i128 @i64_sext_zext_i128_minsize(i64 %a, i64 %b) minsize { +; CHECK-LABEL: i64_sext_zext_i128_minsize: +; CHECK: # %bb.0: +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rdi, %rcx +; CHECK-NEXT: sarq $63, %rcx +; CHECK-NEXT: mulq %rsi ; CHECK-NEXT: imulq %rsi, %rcx ; CHECK-NEXT: addq %rcx, %rdx ; CHECK-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -7,34 +7,39 @@ define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp { ; CHECK-LABEL: x: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r15 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: .cfi_offset %r14, -16 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 ; CHECK-NEXT: movq %rdx, %r11 ; CHECK-NEXT: movq %rdi, %r9 -; CHECK-NEXT: movq %rsi, %rbx -; CHECK-NEXT: sarq $63, %rbx -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: imulq %rbx, %rdi +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: sarq $63, %rdi +; CHECK-NEXT: movq %rdi, %r10 +; CHECK-NEXT: andq %rdx, %r10 ; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: mulq %rbx +; CHECK-NEXT: mulq %rdi ; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: addq %rdi, %rdx -; CHECK-NEXT: imulq %rcx, %rbx -; CHECK-NEXT: addq %rdx, %rbx -; CHECK-NEXT: movq %rcx, %rdi -; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: movq %rdi, %r14 -; CHECK-NEXT: imulq %rsi, %r14 -; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: subq %r10, %rbx +; CHECK-NEXT: andq %rcx, %rdi +; CHECK-NEXT: subq %rdi, %rbx +; CHECK-NEXT: movq %rcx, %r14 +; CHECK-NEXT: sarq $63, %r14 +; CHECK-NEXT: movq %r14, %r15 +; CHECK-NEXT: andq %rsi, %r15 +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: mulq %r9 ; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: addq %r14, %rdx -; CHECK-NEXT: imulq %r9, %rdi -; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: subq %r15, %rdi +; CHECK-NEXT: andq %r9, %r14 +; CHECK-NEXT: subq %r14, %rdi ; CHECK-NEXT: addq %r8, %r10 ; CHECK-NEXT: adcq 
%rbx, %rdi ; CHECK-NEXT: movq %r9, %rax @@ -72,6 +77,7 @@ ; CHECK-NEXT: movq %r9, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 ; CHECK-NEXT: retq ; CHECK-NEXT: LBB0_1: ## %overflow ; CHECK-NEXT: ud2 diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -369,8 +369,8 @@ ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $8, %esp -; X86-NEXT: .cfi_def_cfa_offset 28 +; X86-NEXT: subl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 32 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 @@ -378,52 +378,54 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl %eax, %ebx +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: subl %ebx, %esi +; X86-NEXT: andl %ebp, %edi +; X86-NEXT: subl %edi, %esi ; X86-NEXT: movl %ebp, %edi -; X86-NEXT: imull %ebp, %ebx -; X86-NEXT: addl %edx, %ebx ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %edi, %ebp -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl %ecx, %ebp ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: imull %esi, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %ebp, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
X86-NEXT: andl %edx, %edi +; X86-NEXT: subl %edi, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %bl +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: movl %ebp, %edi ; X86-NEXT: sarl $31, %edi ; X86-NEXT: xorl %edi, %edx @@ -434,11 +436,11 @@ ; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF ; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %ecx -; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: cmovel %ebp, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %esi, %edx -; X86-NEXT: addl $8, %esp +; X86-NEXT: addl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll 
b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -9,39 +9,44 @@ ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %rbx +; X64-NEXT: pushq %r12 ; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: .cfi_offset %rbx, -32 +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 40 +; X64-NEXT: .cfi_offset %rbx, -40 +; X64-NEXT: .cfi_offset %r12, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rdi, %r10 -; X64-NEXT: movq %rsi, %r14 -; X64-NEXT: sarq $63, %r14 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: imulq %r14, %rdi +; X64-NEXT: movq %rsi, %r9 +; X64-NEXT: sarq $63, %r9 +; X64-NEXT: movq %r9, %r11 +; X64-NEXT: andq %rdx, %r11 ; X64-NEXT: movq %rdx, %rax -; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %rcx, %r14 -; X64-NEXT: addq %rdx, %r14 -; X64-NEXT: movq %rcx, %rdi -; X64-NEXT: sarq $63, %rdi -; X64-NEXT: movq %rdi, %r15 -; X64-NEXT: imulq %rsi, %r15 -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: subq %r11, %r14 +; X64-NEXT: andq %rcx, %r9 +; X64-NEXT: subq %r9, %r14 +; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: movq %r15, %r12 +; X64-NEXT: andq %rsi, %r12 +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r15, %rdx -; X64-NEXT: imulq %r10, %rdi -; X64-NEXT: addq %rdx, %rdi -; X64-NEXT: addq %r9, %r11 -; X64-NEXT: adcq %r14, %rdi +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: subq %r12, %r9 +; X64-NEXT: andq %r10, %r15 +; X64-NEXT: subq %r15, %r9 +; X64-NEXT: addq %rdi, %r11 +; X64-NEXT: adcq %r14, %r9 ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r9 +; 
X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rbx @@ -61,15 +66,16 @@ ; X64-NEXT: addq %r14, %rax ; X64-NEXT: adcq %rbx, %rdx ; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: adcq %r9, %rdx ; X64-NEXT: movq %r10, 8(%r8) ; X64-NEXT: sarq $63, %r10 ; X64-NEXT: xorq %r10, %rdx ; X64-NEXT: xorq %rax, %r10 ; X64-NEXT: orq %rdx, %r10 ; X64-NEXT: setne %al -; X64-NEXT: movq %r9, (%r8) +; X64-NEXT: movq %rdi, (%r8) ; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 ; X64-NEXT: retq @@ -84,8 +90,8 @@ ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $56, %esp -; X86-NEXT: .cfi_def_cfa_offset 76 +; X86-NEXT: subl $60, %esp +; X86-NEXT: .cfi_def_cfa_offset 80 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 @@ -99,226 +105,229 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte 
Spill ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: adcl %ebx, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; 
X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; 
X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl %edi, %esi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %esi, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: imull %esi, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill +; X86-NEXT: subl %esi, %edi +; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: subl %ebx, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: 
movl %edx, %ebp -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: subl %esi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: subl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl %esi, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl (%esp), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %edi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $31, %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: 
movl %edx, %ebx +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %eax, %ebx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %eax, %esi +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ebx, %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: andl %edi, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %eax +; 
X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %eax -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload @@ -326,7 +335,7 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: setne %al -; X86-NEXT: addl $56, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -360,234 +369,239 @@ ; 
X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rcx, %r11 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rsi, %r15 +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rsi, %r10 +; X64-NEXT: movq %rdi, %r11 ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rsi, %r10 -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r10, %r14 -; X64-NEXT: adcq %rcx, %r12 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq %rsi, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %ecx -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movzbl %al, %edi +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r12, %rbx -; X64-NEXT: adcq %rcx, %r11 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %r8, %rcx +; X64-NEXT: movq %r9, %rcx +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r12, %rsi +; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r8, %rdi ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %r8, %r13 ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %r9, %rsi -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r12, %r10 -; X64-NEXT: setb %cl -; X64-NEXT: movq %r15, %r9 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: adcq %r12, %rdi +; X64-NEXT: setb %r9b +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r10, %r8 -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: movzbl %r9b, %eax ; X64-NEXT: adcq %rax, %rbp -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 ## 8-byte Folded Reload -; X64-NEXT: adcq %r14, %rbp -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: adcq $0, %r11 +; X64-NEXT: adcq %rbx, %rbp +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq $0, %r15 +; X64-NEXT: movq %r15, %r12 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, %rsi -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r10, %rcx +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %r14 ; 
X64-NEXT: movq %rdx, %r13 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r10, %r9 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdi, %r10 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r9, %rax -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r10, %r15 ; X64-NEXT: adcq %r13, %r11 -; X64-NEXT: setb %cl -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %r11, %r13 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rbp, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %rdi +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rbp, %r15 +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload -; X64-NEXT: setb %cl -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r11 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: addq %rsi, %r13 +; X64-NEXT: adcq %r12, %rdi +; X64-NEXT: setb %r11b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload ; X64-NEXT: movq 
%rbx, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rsi, %r8 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %rcx, %r8 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: addq %r8, %rax -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: adcq %rdi, %r9 -; X64-NEXT: setb %r8b +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: adcq %rsi, %r10 +; X64-NEXT: setb %cl +; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r9, %r14 -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %rbp -; X64-NEXT: addq %r13, %r11 -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r10, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r10, %rbx ; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: adcq %rax, %rbp +; X64-NEXT: addq %r13, %r15 +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rdi, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: adcq %rax, %rbx ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rbx, %r13 -; X64-NEXT: movq %rbx, %r10 +; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: sarq $63, %r13 ; X64-NEXT: movq %r13, %rcx -; X64-NEXT: imulq %r12, %rcx +; X64-NEXT: andq %r9, %rcx ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: imulq %r13, %r15 -; X64-NEXT: addq %rdx, %r15 -; X64-NEXT: movq %r13, %rcx +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: subq %rcx, %r10 +; X64-NEXT: andq %r13, %r14 +; X64-NEXT: subq %r14, %r10 +; X64-NEXT: 
movq %r13, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; X64-NEXT: andq %r14, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload -; X64-NEXT: imulq %rdi, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: imulq %r13, %rsi -; X64-NEXT: addq %rcx, %rsi -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: subq %rsi, %rcx +; X64-NEXT: andq %r13, %rdi +; X64-NEXT: subq %rdi, %rcx +; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: addq %rax, %r8 -; X64-NEXT: adcq %r15, %rsi -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: adcq %r10, %rcx +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r9, %r15 +; X64-NEXT: addq %r11, %r15 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: adcq %r9, %r13 -; X64-NEXT: setb %cl +; X64-NEXT: addq %rsi, %r15 +; X64-NEXT: adcq %r11, %r13 +; X64-NEXT: setb %sil ; X64-NEXT: addq %rax, %r13 -; X64-NEXT: movzbl %cl, %r9d -; X64-NEXT: adcq %rdx, %r9 +; X64-NEXT: movzbl %sil, %esi +; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: addq %r8, %r13 -; X64-NEXT: adcq %rsi, %r9 -; X64-NEXT: sarq $63, %r12 +; X64-NEXT: adcq %rcx, %rsi +; X64-NEXT: sarq $63, %r9 +; X64-NEXT: movq %r9, %r8 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: imulq %r12, %r8 -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rax, %rsi +; X64-NEXT: andq %rax, %r8 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: subq %r8, %r14 +; X64-NEXT: movq %r9, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload -; X64-NEXT: movq %rdi, %rbx 
-; X64-NEXT: imulq %r12, %rbx -; X64-NEXT: addq %r8, %rbx +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: subq %rax, %r14 +; X64-NEXT: movq %r9, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: imulq %r12, %rcx -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rcx, %rdx -; X64-NEXT: imulq %r12, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: addq %rsi, %r8 -; X64-NEXT: adcq %rbx, %r10 -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: addq %r11, %rbx +; X64-NEXT: andq %rax, %r12 +; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: subq %r12, %r8 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload +; X64-NEXT: andq %r9, %rax +; X64-NEXT: subq %rax, %r8 +; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: adcq %r14, %r8 +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: addq %r11, %r14 ; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: addq %rax, %rbx +; X64-NEXT: addq %rax, %r14 ; X64-NEXT: adcq %rdx, %r11 -; X64-NEXT: setb %cl +; X64-NEXT: setb %r9b ; X64-NEXT: addq %rax, %r11 -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movzbl %r9b, %eax ; X64-NEXT: adcq %rdx, %rax -; X64-NEXT: addq %r8, %r11 -; X64-NEXT: adcq %r10, %rax -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload -; X64-NEXT: adcq %r15, %rbx +; X64-NEXT: addq %r10, %r11 +; X64-NEXT: adcq %r8, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload +; X64-NEXT: adcq %r15, %r14 ; X64-NEXT: adcq %r13, %r11 -; X64-NEXT: adcq %r9, %rax -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Folded Reload -; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: adcq %rsi, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload +; X64-NEXT: adcq %rbx, 
%r11 ; X64-NEXT: adcq %rbp, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: sarq $63, %rcx -; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %rcx, %rbx -; X64-NEXT: orq %rax, %rbx -; X64-NEXT: xorq %rcx, %r11 -; X64-NEXT: xorq %rsi, %rcx -; X64-NEXT: orq %r11, %rcx -; X64-NEXT: orq %rbx, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: sarq $63, %rdx +; X64-NEXT: xorq %rdx, %rax +; X64-NEXT: xorq %rdx, %r14 +; X64-NEXT: orq %rax, %r14 +; X64-NEXT: xorq %rdx, %r11 +; X64-NEXT: xorq %rcx, %rdx +; X64-NEXT: orq %r11, %rdx +; X64-NEXT: orq %r14, %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %rdx, 24(%rax) +; X64-NEXT: movq %rsi, 24(%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload ; X64-NEXT: movq %rcx, (%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload @@ -613,400 +627,399 @@ ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $156, %esp -; X86-NEXT: .cfi_def_cfa_offset 176 +; X86-NEXT: subl $152, %esp +; X86-NEXT: .cfi_def_cfa_offset 172 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull 
%ecx +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; 
X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: 
adcl %ebp, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload ; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 
4-byte Spill +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte 
Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %ebp, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; 
X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded 
Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %bl, %ecx -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: adcl $0, %edx @@ -1019,9 +1032,9 @@ ; X86-NEXT: 
adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: adcl $0, %esi @@ -1034,41 +1047,13 @@ ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx @@ -1077,89 
+1062,117 @@ ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl 
%eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edi -; 
X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %edi, %ebx ; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload -; X86-NEXT: movl %edx, %esi -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload @@ -1175,25 +1188,25 @@ ; X86-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: addl %eax, %ebx ; X86-NEXT: movzbl %cl, %eax @@ -1201,76 +1214,75 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %esi, 
%ecx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %al, %edx ; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edx -; 
X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edx, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %edi, %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: andl %eax, %ebx ; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; 
X86-NEXT: subl %ebx, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %ebp, %esi @@ -1280,263 +1292,266 @@ ; X86-NEXT: addl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: setb %bl ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: subl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: andl %edi, %edx +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %edx, %esi -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl %edx, %esi +; X86-NEXT: andl 
{{[0-9]+}}(%esp), %edi +; X86-NEXT: subl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: movl %eax, %edx ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: setb %cl -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl (%esp), %esi ## 4-byte Reload +; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: adcl %ebx, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill 
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %esi +; X86-NEXT: adcl %edx, %edi ; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %bl, %ebx -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %eax, %edi +; X86-NEXT: movzbl %bl, %ebp +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: adcl 
%ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: addl %eax, %edi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %edx +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %ebx, %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %edx +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %eax, %edi ; X86-NEXT: setb %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: imull %ebp, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ebp, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: andl %edx, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: andl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull 
%ebp, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: addl %esi, %eax +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %dl ; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movzbl %dl, %eax -; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movzbl %dl, %ecx +; X86-NEXT: adcl %ebp, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebp, %ecx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: andl %ebp, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edx, %esi +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: andl %ebp, %ecx +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ebp, %eax -; X86-NEXT: addl %esi, %eax -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: subl %ecx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: subl %eax, %ebp +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl 
$0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: setb %cl +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte 
Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 
4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: xorl %ecx, %esi ; X86-NEXT: orl %edx, %esi -; X86-NEXT: xorl %edi, %ecx -; X86-NEXT: orl %esi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: xorl %edi, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: xorl %edi, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: orl %ebp, %edi -; X86-NEXT: orl %ecx, %edi +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, 28(%eax) +; X86-NEXT: movl %ebp, 28(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx ## 4-byte Reload @@ -1552,7 +1567,7 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, 24(%eax) ; X86-NEXT: setne %al -; X86-NEXT: addl $156, %esp +; X86-NEXT: addl $152, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -3297,31 +3297,33 @@ ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: movq %r8, %r14 +; SSE2-NEXT: movq %rcx, %rbp ; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rsi, %r11 ; SSE2-NEXT: movq %rdi, %r10 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE2-NEXT: movq %r11, %r12 -; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movq %r14, %rbx -; SSE2-NEXT: imulq %r12, %rbx +; SSE2-NEXT: movq %r11, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movq %rbx, %r15 +; SSE2-NEXT: andq %r14, %r15 ; SSE2-NEXT: movq %r14, %rax -; SSE2-NEXT: mulq %r12 +; SSE2-NEXT: mulq %rbx ; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: addq %rbx, %rdx -; SSE2-NEXT: imulq %r9, %r12 -; SSE2-NEXT: addq %rdx, %r12 -; SSE2-NEXT: movq %r9, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movq %rbx, %r13 -; SSE2-NEXT: imulq %r11, %r13 -; SSE2-NEXT: movq %rbx, %rax +; SSE2-NEXT: movq %rdx, %r12 +; SSE2-NEXT: subq %r15, %r12 +; SSE2-NEXT: andq %r9, %rbx +; SSE2-NEXT: subq %rbx, %r12 +; SSE2-NEXT: movq %r9, %r13 +; SSE2-NEXT: sarq $63, %r13 +; SSE2-NEXT: movq %r13, %rcx +; SSE2-NEXT: andq %r11, %rcx +; SSE2-NEXT: movq %r13, %rax ; SSE2-NEXT: mulq %r10 ; SSE2-NEXT: movq %rax, %r15 -; SSE2-NEXT: addq %r13, %rdx -; SSE2-NEXT: imulq %r10, %rbx -; SSE2-NEXT: addq %rdx, %rbx +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: subq %rcx, %rbx +; SSE2-NEXT: andq %r10, %r13 +; SSE2-NEXT: subq %r13, %rbx ; SSE2-NEXT: addq %rdi, %r15 ; SSE2-NEXT: adcq %r12, %rbx ; SSE2-NEXT: movq %r10, %rax @@ -3341,11 +3343,11 @@ ; SSE2-NEXT: addq 
%r13, %r10 ; SSE2-NEXT: adcq %r14, %r12 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movzbl %al, %ecx ; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r9 ; SSE2-NEXT: addq %r12, %rax -; SSE2-NEXT: adcq %r14, %rdx +; SSE2-NEXT: adcq %rcx, %rdx ; SSE2-NEXT: addq %r15, %rax ; SSE2-NEXT: adcq %rbx, %rdx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 @@ -3356,52 +3358,56 @@ ; SSE2-NEXT: xorl %r15d, %r15d ; SSE2-NEXT: orq %rdx, %r10 ; SSE2-NEXT: setne %r15b -; SSE2-NEXT: movq %rcx, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movq %rsi, %r10 -; SSE2-NEXT: imulq %rbx, %r10 +; SSE2-NEXT: movq %rbp, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movq %rcx, %r11 +; SSE2-NEXT: andq %rsi, %r11 ; SSE2-NEXT: movq %rsi, %rax -; SSE2-NEXT: mulq %rbx +; SSE2-NEXT: mulq %rcx ; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: addq %r10, %rdx -; SSE2-NEXT: imulq %rbp, %rbx -; SSE2-NEXT: addq %rdx, %rbx -; SSE2-NEXT: movq %rbp, %r10 -; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movq %r10, %r14 -; SSE2-NEXT: imulq %rcx, %r14 -; SSE2-NEXT: movq %r10, %rax -; SSE2-NEXT: mulq %r8 +; SSE2-NEXT: movq %rdx, %r10 +; SSE2-NEXT: subq %r11, %r10 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: andq %rax, %rcx +; SSE2-NEXT: subq %rcx, %r10 ; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: addq %r14, %rdx -; SSE2-NEXT: imulq %r8, %r10 -; SSE2-NEXT: addq %rdx, %r10 -; SSE2-NEXT: addq %r9, %r11 -; SSE2-NEXT: adcq %rbx, %r10 +; SSE2-NEXT: movq %rax, %r13 +; SSE2-NEXT: sarq $63, %r11 +; SSE2-NEXT: movq %r11, %rcx +; SSE2-NEXT: andq %rbp, %rcx +; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: mulq %r8 +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: movq %rdx, %r14 +; SSE2-NEXT: subq %rcx, %r14 +; SSE2-NEXT: andq %r8, %r11 +; SSE2-NEXT: subq %r11, %r14 +; SSE2-NEXT: addq %r9, %rbx +; SSE2-NEXT: adcq %r10, %r14 ; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: movq %rcx, %rax +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rbp, %rax 
; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: addq %r9, %r14 +; SSE2-NEXT: movq %rax, %r11 +; SSE2-NEXT: addq %r9, %r11 ; SSE2-NEXT: adcq $0, %rsi ; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: mulq %rbp +; SSE2-NEXT: mulq %r13 ; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: addq %r14, %r9 +; SSE2-NEXT: addq %r11, %r9 ; SSE2-NEXT: adcq %rsi, %r8 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: mulq %rbp +; SSE2-NEXT: movzbl %al, %ecx +; SSE2-NEXT: movq %rbp, %rax +; SSE2-NEXT: mulq %r13 ; SSE2-NEXT: addq %r8, %rax -; SSE2-NEXT: adcq %rsi, %rdx -; SSE2-NEXT: addq %r11, %rax -; SSE2-NEXT: adcq %r10, %rdx +; SSE2-NEXT: adcq %rcx, %rdx +; SSE2-NEXT: addq %rbx, %rax +; SSE2-NEXT: adcq %r14, %rdx ; SSE2-NEXT: movq %r9, 24(%r12) ; SSE2-NEXT: sarq $63, %r9 ; SSE2-NEXT: xorq %r9, %rdx @@ -3414,7 +3420,7 @@ ; SSE2-NEXT: negl %r15d ; SSE2-NEXT: movd %r15d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rbx, 16(%r12) +; SSE2-NEXT: movq %r10, 16(%r12) ; SSE2-NEXT: movq %rdi, (%r12) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 @@ -3433,31 +3439,33 @@ ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx ; SSSE3-NEXT: movq %r8, %r14 +; SSSE3-NEXT: movq %rcx, %rbp ; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rsi, %r11 ; SSSE3-NEXT: movq %rdi, %r10 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSSE3-NEXT: movq %r11, %r12 -; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movq %r14, %rbx -; SSSE3-NEXT: imulq %r12, %rbx +; SSSE3-NEXT: movq %r11, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: movq %rbx, %r15 +; SSSE3-NEXT: andq %r14, %r15 ; SSSE3-NEXT: movq %r14, %rax -; SSSE3-NEXT: mulq %r12 +; SSSE3-NEXT: mulq %rbx ; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: addq %rbx, %rdx -; SSSE3-NEXT: imulq %r9, %r12 -; SSSE3-NEXT: addq %rdx, %r12 -; SSSE3-NEXT: movq %r9, %rbx -; SSSE3-NEXT: 
sarq $63, %rbx -; SSSE3-NEXT: movq %rbx, %r13 -; SSSE3-NEXT: imulq %r11, %r13 -; SSSE3-NEXT: movq %rbx, %rax +; SSSE3-NEXT: movq %rdx, %r12 +; SSSE3-NEXT: subq %r15, %r12 +; SSSE3-NEXT: andq %r9, %rbx +; SSSE3-NEXT: subq %rbx, %r12 +; SSSE3-NEXT: movq %r9, %r13 +; SSSE3-NEXT: sarq $63, %r13 +; SSSE3-NEXT: movq %r13, %rcx +; SSSE3-NEXT: andq %r11, %rcx +; SSSE3-NEXT: movq %r13, %rax ; SSSE3-NEXT: mulq %r10 ; SSSE3-NEXT: movq %rax, %r15 -; SSSE3-NEXT: addq %r13, %rdx -; SSSE3-NEXT: imulq %r10, %rbx -; SSSE3-NEXT: addq %rdx, %rbx +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: subq %rcx, %rbx +; SSSE3-NEXT: andq %r10, %r13 +; SSSE3-NEXT: subq %r13, %rbx ; SSSE3-NEXT: addq %rdi, %r15 ; SSSE3-NEXT: adcq %r12, %rbx ; SSSE3-NEXT: movq %r10, %rax @@ -3477,11 +3485,11 @@ ; SSSE3-NEXT: addq %r13, %r10 ; SSSE3-NEXT: adcq %r14, %r12 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %r14d +; SSSE3-NEXT: movzbl %al, %ecx ; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r9 ; SSSE3-NEXT: addq %r12, %rax -; SSSE3-NEXT: adcq %r14, %rdx +; SSSE3-NEXT: adcq %rcx, %rdx ; SSSE3-NEXT: addq %r15, %rax ; SSSE3-NEXT: adcq %rbx, %rdx ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12 @@ -3492,52 +3500,56 @@ ; SSSE3-NEXT: xorl %r15d, %r15d ; SSSE3-NEXT: orq %rdx, %r10 ; SSSE3-NEXT: setne %r15b -; SSSE3-NEXT: movq %rcx, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movq %rsi, %r10 -; SSSE3-NEXT: imulq %rbx, %r10 +; SSSE3-NEXT: movq %rbp, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movq %rcx, %r11 +; SSSE3-NEXT: andq %rsi, %r11 ; SSSE3-NEXT: movq %rsi, %rax -; SSSE3-NEXT: mulq %rbx +; SSSE3-NEXT: mulq %rcx ; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: addq %r10, %rdx -; SSSE3-NEXT: imulq %rbp, %rbx -; SSSE3-NEXT: addq %rdx, %rbx -; SSSE3-NEXT: movq %rbp, %r10 -; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movq %r10, %r14 -; SSSE3-NEXT: imulq %rcx, %r14 -; SSSE3-NEXT: movq %r10, %rax -; SSSE3-NEXT: mulq %r8 +; SSSE3-NEXT: movq %rdx, %r10 +; SSSE3-NEXT: subq %r11, %r10 +; SSSE3-NEXT: movq 
{{[0-9]+}}(%rsp), %rax +; SSSE3-NEXT: andq %rax, %rcx +; SSSE3-NEXT: subq %rcx, %r10 ; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: addq %r14, %rdx -; SSSE3-NEXT: imulq %r8, %r10 -; SSSE3-NEXT: addq %rdx, %r10 -; SSSE3-NEXT: addq %r9, %r11 -; SSSE3-NEXT: adcq %rbx, %r10 +; SSSE3-NEXT: movq %rax, %r13 +; SSSE3-NEXT: sarq $63, %r11 +; SSSE3-NEXT: movq %r11, %rcx +; SSSE3-NEXT: andq %rbp, %rcx +; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: mulq %r8 +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: movq %rdx, %r14 +; SSSE3-NEXT: subq %rcx, %r14 +; SSSE3-NEXT: andq %r8, %r11 +; SSSE3-NEXT: subq %r11, %r14 +; SSSE3-NEXT: addq %r9, %rbx +; SSSE3-NEXT: adcq %r10, %r14 ; SSSE3-NEXT: movq %r8, %rax ; SSSE3-NEXT: mulq %rsi ; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: movq %rcx, %rax +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %rbp, %rax ; SSSE3-NEXT: mulq %rsi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: addq %r9, %r14 +; SSSE3-NEXT: movq %rax, %r11 +; SSSE3-NEXT: addq %r9, %r11 ; SSSE3-NEXT: adcq $0, %rsi ; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: mulq %rbp +; SSSE3-NEXT: mulq %r13 ; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: addq %r14, %r9 +; SSSE3-NEXT: addq %r11, %r9 ; SSSE3-NEXT: adcq %rsi, %r8 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movq %rcx, %rax -; SSSE3-NEXT: mulq %rbp +; SSSE3-NEXT: movzbl %al, %ecx +; SSSE3-NEXT: movq %rbp, %rax +; SSSE3-NEXT: mulq %r13 ; SSSE3-NEXT: addq %r8, %rax -; SSSE3-NEXT: adcq %rsi, %rdx -; SSSE3-NEXT: addq %r11, %rax -; SSSE3-NEXT: adcq %r10, %rdx +; SSSE3-NEXT: adcq %rcx, %rdx +; SSSE3-NEXT: addq %rbx, %rax +; SSSE3-NEXT: adcq %r14, %rdx ; SSSE3-NEXT: movq %r9, 24(%r12) ; SSSE3-NEXT: sarq $63, %r9 ; SSSE3-NEXT: xorq %r9, %rdx @@ -3550,7 +3562,7 @@ ; SSSE3-NEXT: negl %r15d ; SSSE3-NEXT: movd %r15d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rbx, 16(%r12) +; 
SSSE3-NEXT: movq %r10, 16(%r12) ; SSSE3-NEXT: movq %rdi, (%r12) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 @@ -3569,31 +3581,33 @@ ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx ; SSE41-NEXT: movq %r8, %r14 +; SSE41-NEXT: movq %rcx, %rbp ; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rsi, %r11 ; SSE41-NEXT: movq %rdi, %r10 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE41-NEXT: movq %r11, %r12 -; SSE41-NEXT: sarq $63, %r12 -; SSE41-NEXT: movq %r14, %rbx -; SSE41-NEXT: imulq %r12, %rbx +; SSE41-NEXT: movq %r11, %rbx +; SSE41-NEXT: sarq $63, %rbx +; SSE41-NEXT: movq %rbx, %r15 +; SSE41-NEXT: andq %r14, %r15 ; SSE41-NEXT: movq %r14, %rax -; SSE41-NEXT: mulq %r12 +; SSE41-NEXT: mulq %rbx ; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: addq %rbx, %rdx -; SSE41-NEXT: imulq %r9, %r12 -; SSE41-NEXT: addq %rdx, %r12 -; SSE41-NEXT: movq %r9, %rbx -; SSE41-NEXT: sarq $63, %rbx -; SSE41-NEXT: movq %rbx, %r13 -; SSE41-NEXT: imulq %r11, %r13 -; SSE41-NEXT: movq %rbx, %rax +; SSE41-NEXT: movq %rdx, %r12 +; SSE41-NEXT: subq %r15, %r12 +; SSE41-NEXT: andq %r9, %rbx +; SSE41-NEXT: subq %rbx, %r12 +; SSE41-NEXT: movq %r9, %r13 +; SSE41-NEXT: sarq $63, %r13 +; SSE41-NEXT: movq %r13, %rcx +; SSE41-NEXT: andq %r11, %rcx +; SSE41-NEXT: movq %r13, %rax ; SSE41-NEXT: mulq %r10 ; SSE41-NEXT: movq %rax, %r15 -; SSE41-NEXT: addq %r13, %rdx -; SSE41-NEXT: imulq %r10, %rbx -; SSE41-NEXT: addq %rdx, %rbx +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: subq %rcx, %rbx +; SSE41-NEXT: andq %r10, %r13 +; SSE41-NEXT: subq %r13, %rbx ; SSE41-NEXT: addq %rdi, %r15 ; SSE41-NEXT: adcq %r12, %rbx ; SSE41-NEXT: movq %r10, %rax @@ -3613,11 +3627,11 @@ ; SSE41-NEXT: addq %r13, %r10 ; SSE41-NEXT: adcq %r14, %r12 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %r14d +; SSE41-NEXT: movzbl %al, %ecx ; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r9 ; SSE41-NEXT: addq %r12, %rax -; SSE41-NEXT: adcq %r14, %rdx +; SSE41-NEXT: adcq %rcx, %rdx ; SSE41-NEXT: addq 
%r15, %rax ; SSE41-NEXT: adcq %rbx, %rdx ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12 @@ -3628,52 +3642,56 @@ ; SSE41-NEXT: xorl %r15d, %r15d ; SSE41-NEXT: orq %rdx, %r10 ; SSE41-NEXT: setne %r15b -; SSE41-NEXT: movq %rcx, %rbx -; SSE41-NEXT: sarq $63, %rbx -; SSE41-NEXT: movq %rsi, %r10 -; SSE41-NEXT: imulq %rbx, %r10 +; SSE41-NEXT: movq %rbp, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: movq %rcx, %r11 +; SSE41-NEXT: andq %rsi, %r11 ; SSE41-NEXT: movq %rsi, %rax -; SSE41-NEXT: mulq %rbx +; SSE41-NEXT: mulq %rcx ; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: addq %r10, %rdx -; SSE41-NEXT: imulq %rbp, %rbx -; SSE41-NEXT: addq %rdx, %rbx -; SSE41-NEXT: movq %rbp, %r10 -; SSE41-NEXT: sarq $63, %r10 -; SSE41-NEXT: movq %r10, %r14 -; SSE41-NEXT: imulq %rcx, %r14 -; SSE41-NEXT: movq %r10, %rax -; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: movq %rdx, %r10 +; SSE41-NEXT: subq %r11, %r10 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE41-NEXT: andq %rax, %rcx +; SSE41-NEXT: subq %rcx, %r10 ; SSE41-NEXT: movq %rax, %r11 -; SSE41-NEXT: addq %r14, %rdx -; SSE41-NEXT: imulq %r8, %r10 -; SSE41-NEXT: addq %rdx, %r10 -; SSE41-NEXT: addq %r9, %r11 -; SSE41-NEXT: adcq %rbx, %r10 +; SSE41-NEXT: movq %rax, %r13 +; SSE41-NEXT: sarq $63, %r11 +; SSE41-NEXT: movq %r11, %rcx +; SSE41-NEXT: andq %rbp, %rcx +; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: movq %rax, %rbx +; SSE41-NEXT: movq %rdx, %r14 +; SSE41-NEXT: subq %rcx, %r14 +; SSE41-NEXT: andq %r8, %r11 +; SSE41-NEXT: subq %r11, %r14 +; SSE41-NEXT: addq %r9, %rbx +; SSE41-NEXT: adcq %r10, %r14 ; SSE41-NEXT: movq %r8, %rax ; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rdx, %r9 -; SSE41-NEXT: movq %rax, %rbx -; SSE41-NEXT: movq %rcx, %rax +; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: movq %rbp, %rax ; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: movq %rax, %r14 -; SSE41-NEXT: addq %r9, %r14 +; SSE41-NEXT: movq %rax, %r11 +; SSE41-NEXT: addq %r9, %r11 ; SSE41-NEXT: adcq $0, %rsi ; SSE41-NEXT: movq 
%r8, %rax -; SSE41-NEXT: mulq %rbp +; SSE41-NEXT: mulq %r13 ; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: addq %r14, %r9 +; SSE41-NEXT: addq %r11, %r9 ; SSE41-NEXT: adcq %rsi, %r8 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %esi -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: mulq %rbp +; SSE41-NEXT: movzbl %al, %ecx +; SSE41-NEXT: movq %rbp, %rax +; SSE41-NEXT: mulq %r13 ; SSE41-NEXT: addq %r8, %rax -; SSE41-NEXT: adcq %rsi, %rdx -; SSE41-NEXT: addq %r11, %rax -; SSE41-NEXT: adcq %r10, %rdx +; SSE41-NEXT: adcq %rcx, %rdx +; SSE41-NEXT: addq %rbx, %rax +; SSE41-NEXT: adcq %r14, %rdx ; SSE41-NEXT: movq %r9, 24(%r12) ; SSE41-NEXT: sarq $63, %r9 ; SSE41-NEXT: xorq %r9, %rdx @@ -3685,7 +3703,7 @@ ; SSE41-NEXT: negl %r15d ; SSE41-NEXT: movd %r15d, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %rbx, 16(%r12) +; SSE41-NEXT: movq %r10, 16(%r12) ; SSE41-NEXT: movq %rdi, (%r12) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 @@ -3704,31 +3722,33 @@ ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx ; AVX-NEXT: movq %r8, %r14 +; AVX-NEXT: movq %rcx, %rbp ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: movq %rsi, %r11 ; AVX-NEXT: movq %rdi, %r10 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; AVX-NEXT: movq %r11, %r12 -; AVX-NEXT: sarq $63, %r12 -; AVX-NEXT: movq %r14, %rbx -; AVX-NEXT: imulq %r12, %rbx +; AVX-NEXT: movq %r11, %rbx +; AVX-NEXT: sarq $63, %rbx +; AVX-NEXT: movq %rbx, %r15 +; AVX-NEXT: andq %r14, %r15 ; AVX-NEXT: movq %r14, %rax -; AVX-NEXT: mulq %r12 +; AVX-NEXT: mulq %rbx ; AVX-NEXT: movq %rax, %rdi -; AVX-NEXT: addq %rbx, %rdx -; AVX-NEXT: imulq %r9, %r12 -; AVX-NEXT: addq %rdx, %r12 -; AVX-NEXT: movq %r9, %rbx -; AVX-NEXT: sarq $63, %rbx -; AVX-NEXT: movq %rbx, %r13 -; AVX-NEXT: imulq %r11, %r13 -; AVX-NEXT: movq %rbx, %rax +; AVX-NEXT: movq %rdx, %r12 +; AVX-NEXT: subq %r15, %r12 +; AVX-NEXT: andq %r9, %rbx +; AVX-NEXT: subq %rbx, %r12 +; AVX-NEXT: movq %r9, %r13 +; AVX-NEXT: sarq $63, 
%r13 +; AVX-NEXT: movq %r13, %rcx +; AVX-NEXT: andq %r11, %rcx +; AVX-NEXT: movq %r13, %rax ; AVX-NEXT: mulq %r10 ; AVX-NEXT: movq %rax, %r15 -; AVX-NEXT: addq %r13, %rdx -; AVX-NEXT: imulq %r10, %rbx -; AVX-NEXT: addq %rdx, %rbx +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: subq %rcx, %rbx +; AVX-NEXT: andq %r10, %r13 +; AVX-NEXT: subq %r13, %rbx ; AVX-NEXT: addq %rdi, %r15 ; AVX-NEXT: adcq %r12, %rbx ; AVX-NEXT: movq %r10, %rax @@ -3748,11 +3768,11 @@ ; AVX-NEXT: addq %r13, %r10 ; AVX-NEXT: adcq %r14, %r12 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %r14d +; AVX-NEXT: movzbl %al, %ecx ; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r9 ; AVX-NEXT: addq %r12, %rax -; AVX-NEXT: adcq %r14, %rdx +; AVX-NEXT: adcq %rcx, %rdx ; AVX-NEXT: addq %r15, %rax ; AVX-NEXT: adcq %rbx, %rdx ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12 @@ -3763,52 +3783,56 @@ ; AVX-NEXT: xorl %r15d, %r15d ; AVX-NEXT: orq %rdx, %r10 ; AVX-NEXT: setne %r15b -; AVX-NEXT: movq %rcx, %rbx -; AVX-NEXT: sarq $63, %rbx -; AVX-NEXT: movq %rsi, %r10 -; AVX-NEXT: imulq %rbx, %r10 +; AVX-NEXT: movq %rbp, %rcx +; AVX-NEXT: sarq $63, %rcx +; AVX-NEXT: movq %rcx, %r11 +; AVX-NEXT: andq %rsi, %r11 ; AVX-NEXT: movq %rsi, %rax -; AVX-NEXT: mulq %rbx +; AVX-NEXT: mulq %rcx ; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: addq %r10, %rdx -; AVX-NEXT: imulq %rbp, %rbx -; AVX-NEXT: addq %rdx, %rbx -; AVX-NEXT: movq %rbp, %r10 -; AVX-NEXT: sarq $63, %r10 -; AVX-NEXT: movq %r10, %r14 -; AVX-NEXT: imulq %rcx, %r14 -; AVX-NEXT: movq %r10, %rax -; AVX-NEXT: mulq %r8 +; AVX-NEXT: movq %rdx, %r10 +; AVX-NEXT: subq %r11, %r10 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX-NEXT: andq %rax, %rcx +; AVX-NEXT: subq %rcx, %r10 ; AVX-NEXT: movq %rax, %r11 -; AVX-NEXT: addq %r14, %rdx -; AVX-NEXT: imulq %r8, %r10 -; AVX-NEXT: addq %rdx, %r10 -; AVX-NEXT: addq %r9, %r11 -; AVX-NEXT: adcq %rbx, %r10 +; AVX-NEXT: movq %rax, %r13 +; AVX-NEXT: sarq $63, %r11 +; AVX-NEXT: movq %r11, %rcx +; AVX-NEXT: andq %rbp, %rcx +; AVX-NEXT: movq %r11, %rax +; 
AVX-NEXT: mulq %r8 +; AVX-NEXT: movq %rax, %rbx +; AVX-NEXT: movq %rdx, %r14 +; AVX-NEXT: subq %rcx, %r14 +; AVX-NEXT: andq %r8, %r11 +; AVX-NEXT: subq %r11, %r14 +; AVX-NEXT: addq %r9, %rbx +; AVX-NEXT: adcq %r10, %r14 ; AVX-NEXT: movq %r8, %rax ; AVX-NEXT: mulq %rsi ; AVX-NEXT: movq %rdx, %r9 -; AVX-NEXT: movq %rax, %rbx -; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movq %rbp, %rax ; AVX-NEXT: mulq %rsi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: movq %rax, %r14 -; AVX-NEXT: addq %r9, %r14 +; AVX-NEXT: movq %rax, %r11 +; AVX-NEXT: addq %r9, %r11 ; AVX-NEXT: adcq $0, %rsi ; AVX-NEXT: movq %r8, %rax -; AVX-NEXT: mulq %rbp +; AVX-NEXT: mulq %r13 ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: addq %r14, %r9 +; AVX-NEXT: addq %r11, %r9 ; AVX-NEXT: adcq %rsi, %r8 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %esi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: mulq %rbp +; AVX-NEXT: movzbl %al, %ecx +; AVX-NEXT: movq %rbp, %rax +; AVX-NEXT: mulq %r13 ; AVX-NEXT: addq %r8, %rax -; AVX-NEXT: adcq %rsi, %rdx -; AVX-NEXT: addq %r11, %rax -; AVX-NEXT: adcq %r10, %rdx +; AVX-NEXT: adcq %rcx, %rdx +; AVX-NEXT: addq %rbx, %rax +; AVX-NEXT: adcq %r14, %rdx ; AVX-NEXT: movq %r9, 24(%r12) ; AVX-NEXT: sarq $63, %r9 ; AVX-NEXT: xorq %r9, %rdx @@ -3820,7 +3844,7 @@ ; AVX-NEXT: negl %r15d ; AVX-NEXT: vmovd %r15d, %xmm0 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %rbx, 16(%r12) +; AVX-NEXT: movq %r10, 16(%r12) ; AVX-NEXT: movq %rdi, (%r12) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 @@ -3838,32 +3862,35 @@ ; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq %r9, %rbp +; AVX512F-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512F-NEXT: movq %rcx, %r11 ; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rsi, %r9 +; AVX512F-NEXT: movq %rsi, %rbp +; AVX512F-NEXT: movq %rdi, %r9 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; 
AVX512F-NEXT: movq %rcx, %r12 -; AVX512F-NEXT: sarq $63, %r12 -; AVX512F-NEXT: movq %r15, %rbx -; AVX512F-NEXT: imulq %r12, %rbx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512F-NEXT: movq %rcx, %rbx +; AVX512F-NEXT: sarq $63, %rbx +; AVX512F-NEXT: movq %rbx, %r14 +; AVX512F-NEXT: andq %r15, %r14 ; AVX512F-NEXT: movq %r15, %rax -; AVX512F-NEXT: mulq %r12 +; AVX512F-NEXT: mulq %rbx ; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: addq %rbx, %rdx -; AVX512F-NEXT: imulq %rsi, %r12 -; AVX512F-NEXT: addq %rdx, %r12 -; AVX512F-NEXT: movq %rsi, %rbx -; AVX512F-NEXT: sarq $63, %rbx -; AVX512F-NEXT: movq %rbx, %r13 -; AVX512F-NEXT: imulq %r11, %r13 -; AVX512F-NEXT: movq %rbx, %rax +; AVX512F-NEXT: movq %rdx, %r12 +; AVX512F-NEXT: subq %r14, %r12 +; AVX512F-NEXT: andq %rdi, %rbx +; AVX512F-NEXT: subq %rbx, %r12 +; AVX512F-NEXT: movq %rdi, %r13 +; AVX512F-NEXT: sarq $63, %r13 +; AVX512F-NEXT: movq %r13, %rsi +; AVX512F-NEXT: andq %r11, %rsi +; AVX512F-NEXT: movq %r13, %rax ; AVX512F-NEXT: mulq %r10 ; AVX512F-NEXT: movq %rax, %r14 -; AVX512F-NEXT: addq %r13, %rdx -; AVX512F-NEXT: imulq %r10, %rbx -; AVX512F-NEXT: addq %rdx, %rbx +; AVX512F-NEXT: movq %rdx, %rbx +; AVX512F-NEXT: subq %rsi, %rbx +; AVX512F-NEXT: andq %r10, %r13 +; AVX512F-NEXT: subq %r13, %rbx ; AVX512F-NEXT: addq %rcx, %r14 ; AVX512F-NEXT: adcq %r12, %rbx ; AVX512F-NEXT: movq %r10, %rax @@ -3877,74 +3904,78 @@ ; AVX512F-NEXT: addq %r12, %r13 ; AVX512F-NEXT: adcq $0, %r15 ; AVX512F-NEXT: movq %r10, %rax -; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: movq %rdx, %r12 ; AVX512F-NEXT: movq %rax, %r10 ; AVX512F-NEXT: addq %r13, %r10 ; AVX512F-NEXT: adcq %r15, %r12 ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %r15d +; AVX512F-NEXT: movzbl %al, %esi ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: addq %r12, %rax -; AVX512F-NEXT: adcq %r15, %rdx +; AVX512F-NEXT: adcq %rsi, %rdx ; AVX512F-NEXT: addq %r14, %rax ; AVX512F-NEXT: 
adcq %rbx, %rdx -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512F-NEXT: movq %r10, 24(%r12) +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512F-NEXT: movq %r10, 24(%r13) ; AVX512F-NEXT: sarq $63, %r10 ; AVX512F-NEXT: xorq %r10, %rdx ; AVX512F-NEXT: xorq %rax, %r10 ; AVX512F-NEXT: orq %rdx, %r10 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: movq %r9, %rsi +; AVX512F-NEXT: movq %rbp, %rsi ; AVX512F-NEXT: sarq $63, %rsi -; AVX512F-NEXT: movq %r8, %r11 -; AVX512F-NEXT: imulq %rsi, %r11 +; AVX512F-NEXT: movq %rsi, %rdi +; AVX512F-NEXT: andq %r8, %rdi ; AVX512F-NEXT: movq %r8, %rax ; AVX512F-NEXT: mulq %rsi ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %r11, %rdx -; AVX512F-NEXT: imulq %rbp, %rsi -; AVX512F-NEXT: addq %rdx, %rsi -; AVX512F-NEXT: movq %rbp, %r11 -; AVX512F-NEXT: sarq $63, %r11 -; AVX512F-NEXT: movq %r11, %r14 -; AVX512F-NEXT: imulq %r9, %r14 -; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: movq %rdx, %r11 +; AVX512F-NEXT: subq %rdi, %r11 +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: andq %rax, %rsi +; AVX512F-NEXT: subq %rsi, %r11 ; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: addq %r14, %rdx -; AVX512F-NEXT: imulq %rdi, %r11 -; AVX512F-NEXT: addq %rdx, %r11 -; AVX512F-NEXT: addq %r10, %rbx -; AVX512F-NEXT: adcq %rsi, %r11 -; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: mulq %r8 -; AVX512F-NEXT: movq %rdx, %r10 +; AVX512F-NEXT: movq %rax, %r12 +; AVX512F-NEXT: sarq $63, %rbx +; AVX512F-NEXT: movq %rbx, %rsi +; AVX512F-NEXT: andq %rbp, %rsi +; AVX512F-NEXT: movq %rbx, %rax +; AVX512F-NEXT: mulq %r9 ; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %rdx, %r15 +; AVX512F-NEXT: subq %rsi, %r15 +; AVX512F-NEXT: andq %r9, %rbx +; AVX512F-NEXT: subq %rbx, %r15 +; AVX512F-NEXT: addq %r10, %r14 +; AVX512F-NEXT: adcq %r11, %r15 ; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: mulq %r8 +; AVX512F-NEXT: movq %rdx, %r10 +; AVX512F-NEXT: movq %rax, 
%r11 +; AVX512F-NEXT: movq %rbp, %rax +; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r8 -; AVX512F-NEXT: movq %rax, %r15 -; AVX512F-NEXT: addq %r10, %r15 +; AVX512F-NEXT: movq %rax, %rbx +; AVX512F-NEXT: addq %r10, %rbx ; AVX512F-NEXT: adcq $0, %r8 -; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: mulq %rbp +; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: mulq %r12 ; AVX512F-NEXT: movq %rdx, %rdi ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %r15, %r10 +; AVX512F-NEXT: addq %rbx, %r10 ; AVX512F-NEXT: adcq %r8, %rdi ; AVX512F-NEXT: setb %al ; AVX512F-NEXT: movzbl %al, %esi -; AVX512F-NEXT: movq %r9, %rax -; AVX512F-NEXT: mulq %rbp +; AVX512F-NEXT: movq %rbp, %rax +; AVX512F-NEXT: mulq %r12 ; AVX512F-NEXT: addq %rdi, %rax ; AVX512F-NEXT: adcq %rsi, %rdx -; AVX512F-NEXT: addq %rbx, %rax -; AVX512F-NEXT: adcq %r11, %rdx -; AVX512F-NEXT: movq %r10, 8(%r12) +; AVX512F-NEXT: addq %r14, %rax +; AVX512F-NEXT: adcq %r15, %rdx +; AVX512F-NEXT: movq %r10, 8(%r13) ; AVX512F-NEXT: sarq $63, %r10 ; AVX512F-NEXT: xorq %r10, %rdx ; AVX512F-NEXT: xorq %rax, %r10 @@ -3956,8 +3987,8 @@ ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq %rcx, 16(%r12) -; AVX512F-NEXT: movq %r14, (%r12) +; AVX512F-NEXT: movq %rcx, 16(%r13) +; AVX512F-NEXT: movq %r11, (%r13) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -3974,32 +4005,35 @@ ; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx -; AVX512BW-NEXT: movq %r9, %rbp +; AVX512BW-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512BW-NEXT: movq %rcx, %r11 ; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rsi, %r9 +; AVX512BW-NEXT: movq %rsi, %rbp +; AVX512BW-NEXT: movq %rdi, %r9 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512BW-NEXT: movq %rcx, %r12 -; AVX512BW-NEXT: sarq $63, %r12 -; AVX512BW-NEXT: 
movq %r15, %rbx -; AVX512BW-NEXT: imulq %r12, %rbx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512BW-NEXT: movq %rcx, %rbx +; AVX512BW-NEXT: sarq $63, %rbx +; AVX512BW-NEXT: movq %rbx, %r14 +; AVX512BW-NEXT: andq %r15, %r14 ; AVX512BW-NEXT: movq %r15, %rax -; AVX512BW-NEXT: mulq %r12 +; AVX512BW-NEXT: mulq %rbx ; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: addq %rbx, %rdx -; AVX512BW-NEXT: imulq %rsi, %r12 -; AVX512BW-NEXT: addq %rdx, %r12 -; AVX512BW-NEXT: movq %rsi, %rbx -; AVX512BW-NEXT: sarq $63, %rbx -; AVX512BW-NEXT: movq %rbx, %r13 -; AVX512BW-NEXT: imulq %r11, %r13 -; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: movq %rdx, %r12 +; AVX512BW-NEXT: subq %r14, %r12 +; AVX512BW-NEXT: andq %rdi, %rbx +; AVX512BW-NEXT: subq %rbx, %r12 +; AVX512BW-NEXT: movq %rdi, %r13 +; AVX512BW-NEXT: sarq $63, %r13 +; AVX512BW-NEXT: movq %r13, %rsi +; AVX512BW-NEXT: andq %r11, %rsi +; AVX512BW-NEXT: movq %r13, %rax ; AVX512BW-NEXT: mulq %r10 ; AVX512BW-NEXT: movq %rax, %r14 -; AVX512BW-NEXT: addq %r13, %rdx -; AVX512BW-NEXT: imulq %r10, %rbx -; AVX512BW-NEXT: addq %rdx, %rbx +; AVX512BW-NEXT: movq %rdx, %rbx +; AVX512BW-NEXT: subq %rsi, %rbx +; AVX512BW-NEXT: andq %r10, %r13 +; AVX512BW-NEXT: subq %r13, %rbx ; AVX512BW-NEXT: addq %rcx, %r14 ; AVX512BW-NEXT: adcq %r12, %rbx ; AVX512BW-NEXT: movq %r10, %rax @@ -4013,74 +4047,78 @@ ; AVX512BW-NEXT: addq %r12, %r13 ; AVX512BW-NEXT: adcq $0, %r15 ; AVX512BW-NEXT: movq %r10, %rax -; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: movq %rdx, %r12 ; AVX512BW-NEXT: movq %rax, %r10 ; AVX512BW-NEXT: addq %r13, %r10 ; AVX512BW-NEXT: adcq %r15, %r12 ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %r15d +; AVX512BW-NEXT: movzbl %al, %esi ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: addq %r12, %rax -; AVX512BW-NEXT: adcq %r15, %rdx +; AVX512BW-NEXT: adcq %rsi, %rdx ; AVX512BW-NEXT: addq %r14, %rax ; AVX512BW-NEXT: adcq %rbx, %rdx -; 
AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512BW-NEXT: movq %r10, 24(%r12) +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512BW-NEXT: movq %r10, 24(%r13) ; AVX512BW-NEXT: sarq $63, %r10 ; AVX512BW-NEXT: xorq %r10, %rdx ; AVX512BW-NEXT: xorq %rax, %r10 ; AVX512BW-NEXT: orq %rdx, %r10 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: movq %r9, %rsi +; AVX512BW-NEXT: movq %rbp, %rsi ; AVX512BW-NEXT: sarq $63, %rsi -; AVX512BW-NEXT: movq %r8, %r11 -; AVX512BW-NEXT: imulq %rsi, %r11 +; AVX512BW-NEXT: movq %rsi, %rdi +; AVX512BW-NEXT: andq %r8, %rdi ; AVX512BW-NEXT: movq %r8, %rax ; AVX512BW-NEXT: mulq %rsi ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %r11, %rdx -; AVX512BW-NEXT: imulq %rbp, %rsi -; AVX512BW-NEXT: addq %rdx, %rsi -; AVX512BW-NEXT: movq %rbp, %r11 -; AVX512BW-NEXT: sarq $63, %r11 -; AVX512BW-NEXT: movq %r11, %r14 -; AVX512BW-NEXT: imulq %r9, %r14 -; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: movq %rdx, %r11 +; AVX512BW-NEXT: subq %rdi, %r11 +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: andq %rax, %rsi +; AVX512BW-NEXT: subq %rsi, %r11 ; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: addq %r14, %rdx -; AVX512BW-NEXT: imulq %rdi, %r11 -; AVX512BW-NEXT: addq %rdx, %r11 -; AVX512BW-NEXT: addq %r10, %rbx -; AVX512BW-NEXT: adcq %rsi, %r11 -; AVX512BW-NEXT: movq %rdi, %rax -; AVX512BW-NEXT: mulq %r8 -; AVX512BW-NEXT: movq %rdx, %r10 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: sarq $63, %rbx +; AVX512BW-NEXT: movq %rbx, %rsi +; AVX512BW-NEXT: andq %rbp, %rsi +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: mulq %r9 ; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %rdx, %r15 +; AVX512BW-NEXT: subq %rsi, %r15 +; AVX512BW-NEXT: andq %r9, %rbx +; AVX512BW-NEXT: subq %rbx, %r15 +; AVX512BW-NEXT: addq %r10, %r14 +; AVX512BW-NEXT: adcq %r11, %r15 ; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: mulq %r8 +; AVX512BW-NEXT: movq 
%rdx, %r10 +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r8 -; AVX512BW-NEXT: movq %rax, %r15 -; AVX512BW-NEXT: addq %r10, %r15 +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: addq %r10, %rbx ; AVX512BW-NEXT: adcq $0, %r8 -; AVX512BW-NEXT: movq %rdi, %rax -; AVX512BW-NEXT: mulq %rbp +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: mulq %r12 ; AVX512BW-NEXT: movq %rdx, %rdi ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %r15, %r10 +; AVX512BW-NEXT: addq %rbx, %r10 ; AVX512BW-NEXT: adcq %r8, %rdi ; AVX512BW-NEXT: setb %al ; AVX512BW-NEXT: movzbl %al, %esi -; AVX512BW-NEXT: movq %r9, %rax -; AVX512BW-NEXT: mulq %rbp +; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: mulq %r12 ; AVX512BW-NEXT: addq %rdi, %rax ; AVX512BW-NEXT: adcq %rsi, %rdx -; AVX512BW-NEXT: addq %rbx, %rax -; AVX512BW-NEXT: adcq %r11, %rdx -; AVX512BW-NEXT: movq %r10, 8(%r12) +; AVX512BW-NEXT: addq %r14, %rax +; AVX512BW-NEXT: adcq %r15, %rdx +; AVX512BW-NEXT: movq %r10, 8(%r13) ; AVX512BW-NEXT: sarq $63, %r10 ; AVX512BW-NEXT: xorq %r10, %rdx ; AVX512BW-NEXT: xorq %rax, %r10 @@ -4092,8 +4130,8 @@ ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq %rcx, 16(%r12) -; AVX512BW-NEXT: movq %r14, (%r12) +; AVX512BW-NEXT: movq %rcx, 16(%r13) +; AVX512BW-NEXT: movq %r11, (%r13) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -215,35 +215,36 @@ ; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: 
mull %edi -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebx, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %ebp -; WIN32-NEXT: imull %ecx, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %ebp, %edx -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: andl %eax, %edi +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %edi, %esi -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: subl %edi, %ecx +; WIN32-NEXT: andl %ebp, %esi +; WIN32-NEXT: subl %esi, %ecx +; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: andl %ebx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: andl %ebx, %ebp +; WIN32-NEXT: subl %ebp, %esi +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl %ebx, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx @@ -262,7 +263,7 @@ ; WIN32-NEXT: addl %edi, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; WIN32-NEXT: 
adcl %esi, %edx ; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx @@ -271,7 +272,7 @@ ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %ebp, 4(%eax) -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al ; WIN32-NEXT: addl $8, %esp @@ -573,49 +574,52 @@ ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: imull %ecx, %edi -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: andl %eax, %edi ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: addl %edi, %edx -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: sarl $31, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: imull %esi, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: addl %edi, %edx -; WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: imull %esi, %ebx -; WIN32-NEXT: addl %edx, %ebx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: andl %ebx, %ecx +; WIN32-NEXT: subl %ecx, %esi +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: andl %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: subl %edi, %ebx +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: andl %ebp, %ecx +; WIN32-NEXT: subl %ecx, 
%ebx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %ebx +; WIN32-NEXT: adcl %esi, %ebx ; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %esi, %ebp -; WIN32-NEXT: adcl $0, %ecx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl $0, %ebp ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %ebp, %esi +; WIN32-NEXT: addl %ecx, %esi +; WIN32-NEXT: adcl %ebp, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: adcl %ecx, %edi ; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) @@ -999,30 +1003,32 @@ ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: movl %ebx, %esi -; WIN32-NEXT: imull %ebx, %edi -; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: movl %ecx, %ebp ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %ebx -; WIN32-NEXT: imull %ecx, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %ebx, %edx -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: andl %eax, %edi +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte 
Spill -; WIN32-NEXT: adcl %edi, %esi -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: subl %edi, %ecx +; WIN32-NEXT: andl %ebx, %esi +; WIN32-NEXT: subl %esi, %ecx +; WIN32-NEXT: sarl $31, %ebx +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: andl %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: andl %ebp, %ebx +; WIN32-NEXT: subl %ebx, %esi +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %ebp, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx @@ -1704,57 +1710,62 @@ ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $16, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %esi -; WIN32-NEXT: movl 4(%eax), %ebp -; WIN32-NEXT: sarl $31, %ebx -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: imull %ebp, %ecx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl 4(%eax), %eax +; WIN32-NEXT: sarl $31, %edi +; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: andl %eax, %ecx +; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: addl %ecx, %edx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: subl %ecx, %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %esi, %ebx -; WIN32-NEXT: addl %edx, %ebx -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: andl %esi, %edi +; WIN32-NEXT: subl %edi, %ebp +; WIN32-NEXT: movl %ebx, %ecx ; 
WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: movl %ecx, %ebx +; WIN32-NEXT: andl %eax, %ebx ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %edi, %edx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: subl %ebx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: andl %edx, %ecx +; WIN32-NEXT: subl %ecx, %edi +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %ebx, %ecx +; WIN32-NEXT: adcl %ebp, %edi ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl %ebx, %edi +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; WIN32-NEXT: adcl $0, %ebp ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %edi, %esi -; WIN32-NEXT: adcl %ebp, %ebx -; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: addl %ebx, %esi +; WIN32-NEXT: adcl %ebp, %ecx +; WIN32-NEXT: setb %bl ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; WIN32-NEXT: mull {{[0-9]+}}(%esp) 
-; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; WIN32-NEXT: adcl %edi, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: movzbl %bl, %ecx ; WIN32-NEXT: adcl %ecx, %edx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: adcl %edi, %edx ; WIN32-NEXT: movl %esi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx @@ -1762,7 +1773,7 @@ ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %esi, 4(%eax) -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al ; WIN32-NEXT: addl $16, %esp @@ -1810,35 +1821,35 @@ ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $12, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %ebp ; WIN32-NEXT: movl 4(%eax), %ebx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: andl %ebp, %edi ; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: subl %edi, %ecx ; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: imull %ebx, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %ebx -; WIN32-NEXT: imull %ecx, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %ebx, %edx -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: addl %edx, 
%esi +; WIN32-NEXT: andl %ebx, %esi +; WIN32-NEXT: subl %esi, %ecx +; WIN32-NEXT: sarl $31, %ebx +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: andl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: andl %edx, %ebx +; WIN32-NEXT: subl %ebx, %esi ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %edi, %esi -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl %edx, %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill