diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3947,6 +3947,24 @@
   return SDValue();
 }
 
+// Fold (mul (sra X, BW-1), Y) -> (neg (and (sra X, BW-1), Y))
+static SDValue foldSraMulToAndNeg(SDNode *N, SDValue N0, SDValue N1,
+                                  SelectionDAG &DAG) {
+  if (N0.getOpcode() != ISD::SRA)
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  unsigned BitWidth = VT.getScalarSizeInBits();
+  ConstantSDNode *ShiftAmt = isConstOrConstSplat(N0.getOperand(1));
+  if (!ShiftAmt || ShiftAmt->getAPIntValue() != (BitWidth - 1))
+    return SDValue();
+
+  SDLoc dl(N);
+  SDValue And = DAG.getNode(ISD::AND, dl, VT, N0, N1);
+  return DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), And);
+}
+
 SDValue DAGCombiner::visitMUL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4158,6 +4176,11 @@
     }
   }
 
+  if (SDValue V = foldSraMulToAndNeg(N, N0, N1, DAG))
+    return V;
+  if (SDValue V = foldSraMulToAndNeg(N, N1, N0, DAG))
+    return V;
+
   // reassociate mul
   if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
     return RMUL;
diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
--- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll
@@ -39,21 +39,24 @@
 ; AARCH: // %bb.0: // %Entry
 ; AARCH-NEXT: asr x9, x1, #63
 ; AARCH-NEXT: asr x10, x3, #63
+; AARCH-NEXT: and x11, x9, x2
+; AARCH-NEXT: and x14, x10, x1
+; AARCH-NEXT: umulh x12, x2, x9
+; AARCH-NEXT: and x9, x9, x3
+; AARCH-NEXT: umulh x13, x10, x0
+; AARCH-NEXT: and x10, x10, x0
+; AARCH-NEXT: sub x12, x12, x11
+; AARCH-NEXT: neg x11, x11
+; AARCH-NEXT: sub x13, x13, x14
+; AARCH-NEXT: sub x9, x12, x9
+; AARCH-NEXT: sub x12, x13, x10
+; AARCH-NEXT: neg x10, x10
 ; AARCH-NEXT: umulh x14, x0, x2
-; AARCH-NEXT: mov x8, x1
-; AARCH-NEXT: mul x11, x2, x9
-; AARCH-NEXT: str wzr, [x4]
-; AARCH-NEXT: umulh x12, x10, x0
-; AARCH-NEXT: umulh x13, x2, x9
-; AARCH-NEXT: madd x12, x10, x1, x12
-; AARCH-NEXT: add x13, x13, x11
-; AARCH-NEXT: mul x10, x10, x0
-; AARCH-NEXT: madd x9, x3, x9, x13
-; AARCH-NEXT: add x12, x12, x10
 ; AARCH-NEXT: adds x10, x10, x11
 ; AARCH-NEXT: mul x11, x1, x2
 ; AARCH-NEXT: adc x9, x12, x9
 ; AARCH-NEXT: umulh x13, x1, x2
+; AARCH-NEXT: mov x8, x1
 ; AARCH-NEXT: mul x12, x0, x3
 ; AARCH-NEXT: adds x11, x11, x14
 ; AARCH-NEXT: umulh x14, x0, x3
@@ -73,6 +76,7 @@
 ; AARCH-NEXT: eor x9, x9, x11
 ; AARCH-NEXT: eor x10, x10, x11
 ; AARCH-NEXT: orr x9, x10, x9
+; AARCH-NEXT: str wzr, [x4]
 ; AARCH-NEXT: cmp x9, #0
 ; AARCH-NEXT: cset w9, ne
 ; AARCH-NEXT: tbz x8, #63, .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -159,24 +159,28 @@
 ; CI: ; %bb.0:
 ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
-; CI-NEXT: v_ashrrev_i32_e32 v13, 31, v0
+; CI-NEXT: v_ashrrev_i32_e32 v11, 31, v0
 ; CI-NEXT: v_mov_b32_e32 v8, 0
-; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v1, v[7:8]
-; CI-NEXT: v_ashrrev_i32_e32 v14, 31, v1
-; CI-NEXT: v_mad_i64_i32 v[11:12], s[4:5], v1, v13, 0
-; CI-NEXT: v_mov_b32_e32 v7, v10
+; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v1, v[7:8]
+; CI-NEXT: v_ashrrev_i32_e32 v12, 31, v1
+; CI-NEXT:
v_and_b32_e32 v14, v11, v1 +; CI-NEXT: v_mov_b32_e32 v1, v10 ; CI-NEXT: v_mov_b32_e32 v10, v8 -; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10] -; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12] -; CI-NEXT: v_add_i32_e32 v9, vcc, v7, v9 -; CI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc -; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10] -; CI-NEXT: v_add_i32_e32 v7, vcc, v9, v0 -; CI-NEXT: v_addc_u32_e32 v9, vcc, v10, v1, vcc -; CI-NEXT: v_mov_b32_e32 v1, v8 +; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v12, v[9:10] +; CI-NEXT: v_and_b32_e32 v13, v11, v12 +; CI-NEXT: v_sub_i32_e32 v9, vcc, 0, v14 +; CI-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc +; CI-NEXT: v_mad_i64_i32 v[9:10], s[4:5], v12, v0, v[9:10] +; CI-NEXT: v_mov_b32_e32 v0, v8 +; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc +; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v12, v[0:1] +; CI-NEXT: v_add_i32_e32 v8, vcc, v0, v9 +; CI-NEXT: v_addc_u32_e32 v9, vcc, v1, v10, vcc +; CI-NEXT: v_mov_b32_e32 v1, v7 ; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; CI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc +; CI-NEXT: v_addc_u32_e32 v2, vcc, v8, v4, vcc ; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -184,60 +188,64 @@ ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v0 -; SI-NEXT: v_mul_lo_u32 v11, v6, v1 -; SI-NEXT: v_mul_hi_u32 v12, v0, v1 ; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; SI-NEXT: v_mul_hi_u32 v14, v6, v1 -; SI-NEXT: v_mul_lo_u32 v13, v0, v7 -; SI-NEXT: v_mul_hi_u32 v10, v0, v7 -; SI-NEXT: v_add_i32_e32 v12, vcc, v11, v12 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_mul_hi_u32 v8, v6, v7 -; SI-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_mul_i32_i24_e32 v9, v6, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; SI-NEXT: v_mul_hi_i32 v6, v1, v6 -; SI-NEXT: v_mul_hi_i32 v7, v7, v0 -; SI-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc -; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc -; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v11 +; SI-NEXT: v_and_b32_e32 v9, v6, v1 +; SI-NEXT: v_and_b32_e32 v10, v7, v0 +; SI-NEXT: v_mul_lo_u32 v13, v6, v1 +; SI-NEXT: v_mul_hi_u32 v14, v0, v1 +; SI-NEXT: v_and_b32_e32 v8, v6, v7 +; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; SI-NEXT: v_mul_hi_u32 v10, v6, v7 +; SI-NEXT: v_mul_i32_i24_e32 v11, v6, v7 +; SI-NEXT: v_mul_hi_u32 v6, v6, v1 +; SI-NEXT: v_mul_hi_u32 v12, v0, v7 +; SI-NEXT: v_mul_lo_u32 v7, v0, v7 +; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v8, vcc +; SI-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; SI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc +; SI-NEXT: v_add_i32_e32 v6, vcc, v11, v6 ; SI-NEXT: v_mul_lo_u32 v0, v0, v1 -; SI-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; SI-NEXT: v_add_i32_e32 v7, vcc, v9, v10 -; SI-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc +; SI-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc +; SI-NEXT: v_sub_i32_e32 v6, vcc, v6, v9 +; SI-NEXT: v_subb_u32_e32 v8, vcc, v10, v8, vcc ; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; SI-NEXT: v_addc_u32_e32 v1, vcc, v12, v3, vcc -; SI-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc -; SI-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc +; SI-NEXT: v_addc_u32_e32 v1, 
vcc, v7, v3, vcc +; SI-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; SI-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc ; SI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: mad_i64_i32_sextops_i32_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0 -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9] -; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, v11 -; GFX9-NEXT: v_mov_b32_e32 v11, v9 -; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 -; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc -; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9] -; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0 -; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13] -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v10 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v0 +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v1, 0 +; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v1 +; GFX9-NEXT: v_and_b32_e32 v6, v14, v1 +; GFX9-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-NEXT: v_and_b32_e32 v7, v14, v15 +; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v6 +; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v1, v[10:11] +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-NEXT: v_mov_b32_e32 v13, v11 +; GFX9-NEXT: v_mad_i64_i32 v[6:7], s[4:5], v15, v0, v[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v15, v[12:13] +; GFX9-NEXT: v_mov_b32_e32 v12, v1 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v15, v[10:11] +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v5, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: mad_i64_i32_sextops_i32_i128: @@ -246,27 +254,30 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_ashrrev_i32_e32 v14, 31, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v16, 31, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v17, 31, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v16, v1, v[7:8] ; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10] -; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0 -; GFX11-NEXT: v_mov_b32_e32 v8, v12 +; GFX11-NEXT: v_and_b32_e32 v8, v16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_mad_u64_u32 
v[11:12], null, v0, v17, v[9:10] +; GFX11-NEXT: v_and_b32_e32 v9, v16, v17 +; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, 0, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v1, v12 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10] -; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8] +; GFX11-NEXT: v_mad_i64_i32 v[14:15], null, v17, v0, v[8:9] +; GFX11-NEXT: v_add_co_u32 v12, s0, v7, v1 ; GFX11-NEXT: v_mov_b32_e32 v7, v11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12 -; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, 0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v17, v[12:13] +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v15, vcc_lo ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll --- a/llvm/test/CodeGen/PowerPC/pr45448.ll +++ b/llvm/test/CodeGen/PowerPC/pr45448.ll @@ -25,7 +25,8 @@ ; CHECK-NEXT: rldic r5, r5, 4, 32 ; CHECK-NEXT: crnot 4*cr5+lt, eq ; CHECK-NEXT: mulhdu r3, r3, r5 -; CHECK-NEXT: maddld r6, r4, r5, r3 +; CHECK-NEXT: and r6, r4, r5 +; CHECK-NEXT: sub r6, r3, r6 ; CHECK-NEXT: cmpld cr1, r6, r3 ; CHECK-NEXT: mulhdu. 
r3, r4, r5 ; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_10 diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -1480,18 +1480,18 @@ ; RV32IM-NEXT: add a5, a6, a2 ; RV32IM-NEXT: mul a7, a1, a3 ; RV32IM-NEXT: add t0, a7, a5 -; RV32IM-NEXT: mul t1, a4, a0 -; RV32IM-NEXT: add a2, t0, t1 +; RV32IM-NEXT: and t1, a4, a0 +; RV32IM-NEXT: sub a2, t0, t1 ; RV32IM-NEXT: sltu t2, a2, t0 ; RV32IM-NEXT: sltu a7, t0, a7 ; RV32IM-NEXT: sltu a5, a5, a6 ; RV32IM-NEXT: mulhu a3, a1, a3 ; RV32IM-NEXT: add a3, a3, a5 ; RV32IM-NEXT: add a3, a3, a7 -; RV32IM-NEXT: mul a1, a4, a1 +; RV32IM-NEXT: and a1, a4, a1 ; RV32IM-NEXT: mulhu a0, a4, a0 -; RV32IM-NEXT: add a0, a0, a1 -; RV32IM-NEXT: add a0, a0, t1 +; RV32IM-NEXT: sub a0, a0, a1 +; RV32IM-NEXT: sub a0, a0, t1 ; RV32IM-NEXT: add a0, a3, a0 ; RV32IM-NEXT: add a1, a0, t2 ; RV32IM-NEXT: mv a0, a2 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -991,8 +991,10 @@ ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s2, 4(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 ; RV32-NEXT: .cfi_offset s1, -8 +; RV32-NEXT: .cfi_offset s2, -12 ; RV32-NEXT: mulhu a5, a0, a2 ; RV32-NEXT: mul a6, a1, a2 ; RV32-NEXT: add a5, a6, a5 @@ -1008,33 +1010,34 @@ ; RV32-NEXT: mul t0, a1, a3 ; RV32-NEXT: add t1, t0, a7 ; RV32-NEXT: srai t2, a1, 31 -; RV32-NEXT: mul t3, a2, t2 +; RV32-NEXT: and t3, t2, a2 ; RV32-NEXT: srai t4, a3, 31 -; RV32-NEXT: mul t5, t4, a0 -; RV32-NEXT: add t6, t5, t3 -; RV32-NEXT: add s0, t1, t6 -; RV32-NEXT: sltu s1, s0, t1 +; RV32-NEXT: and t5, t4, a0 +; RV32-NEXT: neg t6, t5 +; RV32-NEXT: sub s0, t6, t3 +; RV32-NEXT: add s1, t1, s0 +; RV32-NEXT: sltu s2, s1, t1 ; RV32-NEXT: sltu t0, t1, t0 ; RV32-NEXT: sltu a6, a7, a6 ; RV32-NEXT: mulhu a7, a1, a3 ; RV32-NEXT: add a6, a7, a6 ; RV32-NEXT: add a6, a6, t0 ; RV32-NEXT: mulhu a7, a2, t2 -; RV32-NEXT: add a7, a7, t3 -; RV32-NEXT: mul a3, a3, t2 -; RV32-NEXT: add a3, a7, a3 -; RV32-NEXT: mul a1, t4, a1 +; RV32-NEXT: sub a7, a7, t3 +; RV32-NEXT: and a3, t2, a3 +; RV32-NEXT: sub a3, a7, a3 +; RV32-NEXT: and a1, t4, a1 ; RV32-NEXT: mulhu a7, t4, a0 -; RV32-NEXT: add a1, a7, a1 -; RV32-NEXT: add a1, a1, t5 +; RV32-NEXT: sub a1, a7, a1 +; RV32-NEXT: sub a1, a1, t5 ; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: sltu a3, t6, t5 +; RV32-NEXT: sltu a3, s0, t6 ; RV32-NEXT: add a1, a1, a3 ; RV32-NEXT: add a1, a6, a1 -; RV32-NEXT: add a1, a1, s1 +; RV32-NEXT: add a1, a1, s2 ; RV32-NEXT: srai a3, a5, 31 ; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: xor a3, s0, a3 +; RV32-NEXT: xor a3, s1, a3 ; RV32-NEXT: or a1, a3, a1 ; RV32-NEXT: snez a1, a1 ; RV32-NEXT: mul a0, a0, a2 @@ -1043,6 +1046,7 @@ ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s2, 4(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -1062,8 +1066,10 @@ ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s2, 4(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 ; RV32ZBA-NEXT: .cfi_offset s1, -8 +; RV32ZBA-NEXT: .cfi_offset s2, -12 ; RV32ZBA-NEXT: mulhu a5, a0, a2 ; RV32ZBA-NEXT: mul a6, a1, a2 ; RV32ZBA-NEXT: add a5, a6, a5 @@ -1079,33 
+1085,34 @@ ; RV32ZBA-NEXT: mul t0, a1, a3 ; RV32ZBA-NEXT: add t1, t0, a7 ; RV32ZBA-NEXT: srai t2, a1, 31 -; RV32ZBA-NEXT: mul t3, a2, t2 +; RV32ZBA-NEXT: and t3, t2, a2 ; RV32ZBA-NEXT: srai t4, a3, 31 -; RV32ZBA-NEXT: mul t5, t4, a0 -; RV32ZBA-NEXT: add t6, t5, t3 -; RV32ZBA-NEXT: add s0, t1, t6 -; RV32ZBA-NEXT: sltu s1, s0, t1 +; RV32ZBA-NEXT: and t5, t4, a0 +; RV32ZBA-NEXT: neg t6, t5 +; RV32ZBA-NEXT: sub s0, t6, t3 +; RV32ZBA-NEXT: add s1, t1, s0 +; RV32ZBA-NEXT: sltu s2, s1, t1 ; RV32ZBA-NEXT: sltu t0, t1, t0 ; RV32ZBA-NEXT: sltu a6, a7, a6 ; RV32ZBA-NEXT: mulhu a7, a1, a3 ; RV32ZBA-NEXT: add a6, a7, a6 ; RV32ZBA-NEXT: add a6, a6, t0 ; RV32ZBA-NEXT: mulhu a7, a2, t2 -; RV32ZBA-NEXT: add a7, a7, t3 -; RV32ZBA-NEXT: mul a3, a3, t2 -; RV32ZBA-NEXT: add a3, a7, a3 -; RV32ZBA-NEXT: mul a1, t4, a1 +; RV32ZBA-NEXT: sub a7, a7, t3 +; RV32ZBA-NEXT: and a3, t2, a3 +; RV32ZBA-NEXT: sub a3, a7, a3 +; RV32ZBA-NEXT: and a1, t4, a1 ; RV32ZBA-NEXT: mulhu a7, t4, a0 -; RV32ZBA-NEXT: add a1, a7, a1 -; RV32ZBA-NEXT: add a1, a1, t5 +; RV32ZBA-NEXT: sub a1, a7, a1 +; RV32ZBA-NEXT: sub a1, a1, t5 ; RV32ZBA-NEXT: add a1, a1, a3 -; RV32ZBA-NEXT: sltu a3, t6, t5 +; RV32ZBA-NEXT: sltu a3, s0, t6 ; RV32ZBA-NEXT: add a1, a1, a3 ; RV32ZBA-NEXT: add a1, a6, a1 -; RV32ZBA-NEXT: add a1, a1, s1 +; RV32ZBA-NEXT: add a1, a1, s2 ; RV32ZBA-NEXT: srai a3, a5, 31 ; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: xor a3, s0, a3 +; RV32ZBA-NEXT: xor a3, s1, a3 ; RV32ZBA-NEXT: or a1, a3, a1 ; RV32ZBA-NEXT: snez a1, a1 ; RV32ZBA-NEXT: mul a0, a0, a2 @@ -1114,6 +1121,7 @@ ; RV32ZBA-NEXT: mv a0, a1 ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s2, 4(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -1145,8 +1153,8 @@ ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: srai a1, a1, 31 -; RV32-NEXT: mul a6, a1, a3 -; RV32-NEXT: add a6, a5, a6 +; RV32-NEXT: andi a6, a1, 13 +; RV32-NEXT: sub a6, a5, a6 ; RV32-NEXT: srai a7, a4, 31 ; RV32-NEXT: xor t0, a6, a7 ; RV32-NEXT: sltu a5, a6, a5 @@ -1182,8 +1190,8 @@ ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: srai a1, a1, 31 -; RV32ZBA-NEXT: mul a6, a1, a3 -; RV32ZBA-NEXT: add a6, a5, a6 +; RV32ZBA-NEXT: andi a6, a1, 13 +; RV32ZBA-NEXT: sub a6, a5, a6 ; RV32ZBA-NEXT: srai a7, a4, 31 ; RV32ZBA-NEXT: xor t0, a6, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 @@ -2414,7 +2422,9 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -2430,33 +2440,34 @@ ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 +; RV32-NEXT: and t2, t1, a2 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 +; RV32-NEXT: and t4, t3, a0 +; RV32-NEXT: neg t5, t4 +; RV32-NEXT: sub t6, t5, t2 +; RV32-NEXT: add s0, t0, t6 +; RV32-NEXT: sltu s1, s0, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a6, a2, t1 -; RV32-NEXT: add a6, a6, t2 -; RV32-NEXT: mul a7, a3, t1 -; RV32-NEXT: add a6, a6, a7 -; RV32-NEXT: mul a7, t3, a1 +; RV32-NEXT: sub a6, 
a6, t2 +; RV32-NEXT: and a7, t1, a3 +; RV32-NEXT: sub a6, a6, a7 +; RV32-NEXT: and a7, t3, a1 ; RV32-NEXT: mulhu t0, t3, a0 -; RV32-NEXT: add a7, t0, a7 -; RV32-NEXT: add a7, a7, t4 +; RV32-NEXT: sub a7, t0, a7 +; RV32-NEXT: sub a7, a7, t4 ; RV32-NEXT: add a6, a7, a6 -; RV32-NEXT: sltu a7, t5, t4 +; RV32-NEXT: sltu a7, t6, t5 ; RV32-NEXT: add a6, a6, a7 ; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: add a5, a5, s0 +; RV32-NEXT: add a5, a5, s1 ; RV32-NEXT: srai a4, a4, 31 ; RV32-NEXT: xor a5, a5, a4 -; RV32-NEXT: xor a4, t6, a4 +; RV32-NEXT: xor a4, s0, a4 ; RV32-NEXT: or a4, a4, a5 ; RV32-NEXT: bnez a4, .LBB46_2 ; RV32-NEXT: # %bb.1: # %entry @@ -2464,6 +2475,7 @@ ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: .LBB46_2: # %entry ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -2483,7 +2495,9 @@ ; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 +; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -2499,33 +2513,34 @@ ; RV32ZBA-NEXT: mul a7, a1, a3 ; RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 +; RV32ZBA-NEXT: and t2, t1, a2 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 +; RV32ZBA-NEXT: and t4, t3, a0 +; RV32ZBA-NEXT: neg t5, t4 +; RV32ZBA-NEXT: sub t6, t5, t2 +; RV32ZBA-NEXT: add s0, t0, t6 +; RV32ZBA-NEXT: sltu s1, s0, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a6, a2, t1 -; RV32ZBA-NEXT: add a6, a6, t2 -; RV32ZBA-NEXT: mul a7, a3, t1 -; RV32ZBA-NEXT: add a6, a6, a7 -; RV32ZBA-NEXT: mul a7, t3, a1 +; RV32ZBA-NEXT: sub a6, a6, t2 +; RV32ZBA-NEXT: and a7, t1, a3 +; RV32ZBA-NEXT: sub a6, a6, a7 +; RV32ZBA-NEXT: and a7, t3, a1 ; RV32ZBA-NEXT: mulhu t0, t3, a0 -; RV32ZBA-NEXT: add a7, t0, a7 -; RV32ZBA-NEXT: add a7, a7, t4 +; RV32ZBA-NEXT: sub a7, t0, a7 +; RV32ZBA-NEXT: sub a7, a7, t4 ; RV32ZBA-NEXT: add a6, a7, a6 -; RV32ZBA-NEXT: sltu a7, t5, t4 +; RV32ZBA-NEXT: sltu a7, t6, t5 ; RV32ZBA-NEXT: add a6, a6, a7 ; RV32ZBA-NEXT: add a5, a5, a6 -; RV32ZBA-NEXT: add a5, a5, s0 +; RV32ZBA-NEXT: add a5, a5, s1 ; RV32ZBA-NEXT: srai a4, a4, 31 ; RV32ZBA-NEXT: xor a5, a5, a4 -; RV32ZBA-NEXT: xor a4, t6, a4 +; RV32ZBA-NEXT: xor a4, s0, a4 ; RV32ZBA-NEXT: or a4, a4, a5 ; RV32ZBA-NEXT: bnez a4, .LBB46_2 ; RV32ZBA-NEXT: # %bb.1: # %entry @@ -2533,6 +2548,7 @@ ; RV32ZBA-NEXT: mv a1, a3 ; RV32ZBA-NEXT: .LBB46_2: # %entry ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -2559,7 +2575,9 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -2575,36 +2593,38 @@ ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 +; RV32-NEXT: and t2, t1, a2 ; RV32-NEXT: 
srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 +; RV32-NEXT: and t4, t3, a0 +; RV32-NEXT: neg t5, t4 +; RV32-NEXT: sub t6, t5, t2 +; RV32-NEXT: add s0, t0, t6 +; RV32-NEXT: sltu s1, s0, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a2, a2, t1 -; RV32-NEXT: add a2, a2, t2 -; RV32-NEXT: mul a3, a3, t1 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: mul a1, t3, a1 +; RV32-NEXT: sub a2, a2, t2 +; RV32-NEXT: and a3, t1, a3 +; RV32-NEXT: sub a2, a2, a3 +; RV32-NEXT: and a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, t4 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: sub a0, a0, t4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: sltu a1, t5, t4 +; RV32-NEXT: sltu a1, t6, t5 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a0, a0, s0 +; RV32-NEXT: add a0, a0, s1 ; RV32-NEXT: srai a1, a4, 31 ; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: xor a1, t6, a1 +; RV32-NEXT: xor a1, s0, a1 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -2622,7 +2642,9 @@ ; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 +; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -2638,36 +2660,38 @@ ; RV32ZBA-NEXT: mul a7, a1, a3 ; RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 +; RV32ZBA-NEXT: and t2, t1, a2 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 +; RV32ZBA-NEXT: and t4, t3, a0 +; RV32ZBA-NEXT: neg t5, t4 +; RV32ZBA-NEXT: sub t6, t5, t2 +; RV32ZBA-NEXT: add s0, t0, t6 +; RV32ZBA-NEXT: sltu s1, s0, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a2, a2, t1 -; RV32ZBA-NEXT: add a2, a2, t2 -; RV32ZBA-NEXT: mul a3, a3, t1 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: mul a1, t3, a1 +; RV32ZBA-NEXT: sub a2, a2, t2 +; RV32ZBA-NEXT: and a3, t1, a3 +; RV32ZBA-NEXT: sub a2, a2, a3 +; RV32ZBA-NEXT: and a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a0, a0, t4 +; RV32ZBA-NEXT: sub a0, a0, a1 +; RV32ZBA-NEXT: sub a0, a0, t4 ; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: sltu a1, t5, t4 +; RV32ZBA-NEXT: sltu a1, t6, t5 ; RV32ZBA-NEXT: add a0, a0, a1 ; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: add a0, a0, s0 +; RV32ZBA-NEXT: add a0, a0, s1 ; RV32ZBA-NEXT: srai a1, a4, 31 ; RV32ZBA-NEXT: xor a0, a0, a1 -; RV32ZBA-NEXT: xor a1, t6, a1 +; RV32ZBA-NEXT: xor a1, s0, a1 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: seqz a0, a0 ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -3529,7 +3553,9 @@ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s1, 8(sp) # 
4-byte Folded Spill ; RV32-NEXT: .cfi_offset s0, -4 +; RV32-NEXT: .cfi_offset s1, -8 ; RV32-NEXT: mulhu a4, a0, a2 ; RV32-NEXT: mul a5, a1, a2 ; RV32-NEXT: add a4, a5, a4 @@ -3545,33 +3571,34 @@ ; RV32-NEXT: mul a7, a1, a3 ; RV32-NEXT: add t0, a7, a6 ; RV32-NEXT: srai t1, a1, 31 -; RV32-NEXT: mul t2, a2, t1 +; RV32-NEXT: and t2, t1, a2 ; RV32-NEXT: srai t3, a3, 31 -; RV32-NEXT: mul t4, t3, a0 -; RV32-NEXT: add t5, t4, t2 -; RV32-NEXT: add t6, t0, t5 -; RV32-NEXT: sltu s0, t6, t0 +; RV32-NEXT: and t4, t3, a0 +; RV32-NEXT: neg t5, t4 +; RV32-NEXT: sub t6, t5, t2 +; RV32-NEXT: add s0, t0, t6 +; RV32-NEXT: sltu s1, s0, t0 ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 ; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a2, a2, t1 -; RV32-NEXT: add a2, a2, t2 -; RV32-NEXT: mul a3, a3, t1 -; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: mul a1, t3, a1 +; RV32-NEXT: sub a2, a2, t2 +; RV32-NEXT: and a3, t1, a3 +; RV32-NEXT: sub a2, a2, a3 +; RV32-NEXT: and a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add a0, a0, t4 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: sub a0, a0, t4 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: sltu a1, t5, t4 +; RV32-NEXT: sltu a1, t6, t5 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a0, a0, s0 +; RV32-NEXT: add a0, a0, s1 ; RV32-NEXT: srai a1, a4, 31 ; RV32-NEXT: xor a0, a0, a1 -; RV32-NEXT: xor a1, t6, a1 +; RV32-NEXT: xor a1, s0, a1 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: beqz a0, .LBB61_2 ; RV32-NEXT: # %bb.1: # %overflow @@ -3581,6 +3608,7 @@ ; RV32-NEXT: li a0, 1 ; RV32-NEXT: .LBB61_3: # %overflow ; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; @@ -3602,7 +3630,9 @@ ; RV32ZBA-NEXT: addi sp, sp, -16 ; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 ; RV32ZBA-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s1, 8(sp) # 4-byte Folded Spill ; RV32ZBA-NEXT: .cfi_offset s0, -4 +; RV32ZBA-NEXT: .cfi_offset s1, -8 ; RV32ZBA-NEXT: mulhu a4, a0, a2 ; RV32ZBA-NEXT: mul a5, a1, a2 ; RV32ZBA-NEXT: add a4, a5, a4 @@ -3618,33 +3648,34 @@ ; RV32ZBA-NEXT: mul a7, a1, a3 ; RV32ZBA-NEXT: add t0, a7, a6 ; RV32ZBA-NEXT: srai t1, a1, 31 -; RV32ZBA-NEXT: mul t2, a2, t1 +; RV32ZBA-NEXT: and t2, t1, a2 ; RV32ZBA-NEXT: srai t3, a3, 31 -; RV32ZBA-NEXT: mul t4, t3, a0 -; RV32ZBA-NEXT: add t5, t4, t2 -; RV32ZBA-NEXT: add t6, t0, t5 -; RV32ZBA-NEXT: sltu s0, t6, t0 +; RV32ZBA-NEXT: and t4, t3, a0 +; RV32ZBA-NEXT: neg t5, t4 +; RV32ZBA-NEXT: sub t6, t5, t2 +; RV32ZBA-NEXT: add s0, t0, t6 +; RV32ZBA-NEXT: sltu s1, s0, t0 ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 ; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a2, a2, t1 -; RV32ZBA-NEXT: add a2, a2, t2 -; RV32ZBA-NEXT: mul a3, a3, t1 -; RV32ZBA-NEXT: add a2, a2, a3 -; RV32ZBA-NEXT: mul a1, t3, a1 +; RV32ZBA-NEXT: sub a2, a2, t2 +; RV32ZBA-NEXT: and a3, t1, a3 +; RV32ZBA-NEXT: sub a2, a2, a3 +; RV32ZBA-NEXT: and a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 -; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: add a0, a0, t4 +; RV32ZBA-NEXT: sub a0, a0, a1 +; RV32ZBA-NEXT: sub a0, a0, t4 ; RV32ZBA-NEXT: add a0, a0, a2 -; RV32ZBA-NEXT: sltu a1, t5, t4 +; RV32ZBA-NEXT: sltu a1, t6, t5 ; RV32ZBA-NEXT: add a0, a0, a1 ; RV32ZBA-NEXT: add a0, a5, a0 -; RV32ZBA-NEXT: add a0, a0, s0 +; RV32ZBA-NEXT: add a0, a0, s1 ; RV32ZBA-NEXT: srai a1, a4, 31 ; 
RV32ZBA-NEXT: xor a0, a0, a1 -; RV32ZBA-NEXT: xor a1, t6, a1 +; RV32ZBA-NEXT: xor a1, s0, a1 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: beqz a0, .LBB61_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow @@ -3654,6 +3685,7 @@ ; RV32ZBA-NEXT: li a0, 1 ; RV32ZBA-NEXT: .LBB61_3: # %overflow ; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret ; @@ -3701,8 +3733,8 @@ ; RV32-NEXT: add a6, a4, a6 ; RV32-NEXT: sub t1, a6, a1 ; RV32-NEXT: srai t2, a1, 31 -; RV32-NEXT: mul t3, t2, a2 -; RV32-NEXT: sub t3, t3, a0 +; RV32-NEXT: andi t3, t2, -13 +; RV32-NEXT: sub t3, a5, t3 ; RV32-NEXT: add t4, t1, t3 ; RV32-NEXT: sltu t5, t4, t1 ; RV32-NEXT: neg t6, a1 @@ -3763,8 +3795,8 @@ ; RV32ZBA-NEXT: add a6, a4, a6 ; RV32ZBA-NEXT: sub t1, a6, a1 ; RV32ZBA-NEXT: srai t2, a1, 31 -; RV32ZBA-NEXT: mul t3, t2, a2 -; RV32ZBA-NEXT: sub t3, t3, a0 +; RV32ZBA-NEXT: andi t3, t2, -13 +; RV32ZBA-NEXT: sub t3, a5, t3 ; RV32ZBA-NEXT: add t4, t1, t3 ; RV32ZBA-NEXT: sltu t5, t4, t1 ; RV32ZBA-NEXT: neg t6, a1 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll --- a/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll @@ -38,22 +38,23 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_0246_ext0(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0246_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r1, s2 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r2, r5, r3, r0 +; CHECK-NEXT: umull r2, r4, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: mla r4, r1, r2, r12 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r2, r3, r2, r5 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: mla r1, r1, r0, r4 -; CHECK-NEXT: mla r0, r3, r0, r2 +; CHECK-NEXT: and.w r2, r1, r0, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r0, r1, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r3, r0, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r0, r0, r3, asr #31 +; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -67,22 +68,23 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_0246(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_0246: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: asrs r4, r0, #31 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull lr, r12, r0, r1 -; CHECK-NEXT: umull r2, r5, r0, r3 +; CHECK-NEXT: umull r2, r4, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: mla r2, r0, r2, r12 -; CHECK-NEXT: mla r1, r4, r1, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r0, r0, r2, r5 -; CHECK-NEXT: mla r0, r4, r3, r0 +; CHECK-NEXT: and.w r2, r0, r1, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r1, r0, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r0, r3, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r0, r3, r0, asr #31 +; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: 
vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -130,23 +132,24 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_1357_ext0(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_1357_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vrev64.32 q1, q0 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r1, r0 -; CHECK-NEXT: umull r2, r5, r3, r0 +; CHECK-NEXT: umull r2, r4, r3, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: mla r4, r1, r2, r12 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r2, r3, r2, r5 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: mla r1, r1, r0, r4 -; CHECK-NEXT: mla r0, r3, r0, r2 +; CHECK-NEXT: and.w r2, r1, r0, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r0, r1, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r3, r0, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r0, r0, r3, asr #31 +; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -160,23 +163,24 @@ define arm_aapcs_vfpcc <2 x i64> @sext32_ext0_1357(<4 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_1357: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vrev64.32 q1, q0 -; CHECK-NEXT: asrs r4, r0, #31 ; CHECK-NEXT: vmov r1, s6 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r0, r1 -; CHECK-NEXT: umull r2, r5, r0, r3 +; CHECK-NEXT: umull r2, r4, r0, r3 ; CHECK-NEXT: vmov q0[2], q0[0], r2, lr -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: mla r2, r0, r2, r12 -; CHECK-NEXT: mla r1, r4, r1, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r0, r0, r2, r5 -; CHECK-NEXT: mla r0, r4, r3, r0 +; CHECK-NEXT: and.w r2, r0, r1, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r1, r0, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r0, r3, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r0, r3, r0, asr #31 +; CHECK-NEXT: subs r0, r2, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r1 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> %out1 = sext <2 x i32> %shuf1 to <2 x i64> @@ -230,36 +234,39 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_0213_ext0(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_0213_ext0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov.f32 s4, s1 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: umull r2, r5, r3, r0 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r1, r0 +; CHECK-NEXT: umull r2, r4, r3, r0 ; CHECK-NEXT: vmov q1[2], q1[0], r2, lr -; CHECK-NEXT: asrs r2, r0, #31 -; CHECK-NEXT: mla r4, r1, r2, r12 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r5, r3, r2, r5 -; CHECK-NEXT: asrs r3, r3, #31 -; CHECK-NEXT: mla 
r1, r1, r0, r4 -; CHECK-NEXT: mla r3, r3, r0, r5 -; CHECK-NEXT: vmov q1[3], q1[1], r3, r1 +; CHECK-NEXT: and.w r2, r1, r0, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r0, r1, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r3, r0, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r3, r0, r3, asr #31 +; CHECK-NEXT: subs r2, r2, r3 +; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: umull r3, r5, r1, r0 -; CHECK-NEXT: mla r5, r1, r2, r5 -; CHECK-NEXT: asrs r1, r1, #31 -; CHECK-NEXT: mla r12, r1, r0, r5 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: umull r4, r1, r5, r0 -; CHECK-NEXT: mla r1, r5, r2, r1 -; CHECK-NEXT: asrs r2, r5, #31 +; CHECK-NEXT: and.w r2, r1, r0, asr #31 +; CHECK-NEXT: umull r3, r4, r1, r0 +; CHECK-NEXT: and.w r1, r0, r1, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sub.w r12, r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: umull r4, r1, r2, r0 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 -; CHECK-NEXT: mla r0, r2, r0, r1 +; CHECK-NEXT: and.w r3, r2, r0, asr #31 +; CHECK-NEXT: and.w r0, r0, r2, asr #31 +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> %out1 = sext <4 x i32> %shuf1 to <4 x i64> @@ -273,36 +280,39 @@ define arm_aapcs_vfpcc <4 x i64> @sext32_ext0_0213(<8 x i32> %src1, i32 %src2) { ; CHECK-LABEL: sext32_ext0_0213: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: asrs r4, r0, #31 +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: vmov.f32 s6, s3 -; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.f32 s4, s1 ; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: umull r2, r5, r0, r3 +; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: umull lr, r12, r0, r1 +; CHECK-NEXT: umull r2, r4, r0, r3 ; CHECK-NEXT: vmov q1[2], q1[0], r2, lr -; CHECK-NEXT: asrs r2, r1, #31 -; CHECK-NEXT: mla r2, r0, r2, r12 -; CHECK-NEXT: mla r1, r4, r1, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r2, r0, r2, r5 -; CHECK-NEXT: mla r2, r4, r3, r2 +; CHECK-NEXT: and.w r2, r0, r1, asr #31 +; CHECK-NEXT: sub.w r2, r12, r2 +; CHECK-NEXT: and.w r1, r1, r0, asr #31 +; CHECK-NEXT: subs r1, r2, r1 +; CHECK-NEXT: and.w r2, r0, r3, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: and.w r3, r3, r0, asr #31 +; CHECK-NEXT: subs r2, r2, r3 ; CHECK-NEXT: vmov q1[3], q1[1], r2, r1 ; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: umull r2, r3, r0, r1 -; CHECK-NEXT: asrs r5, r1, #31 -; CHECK-NEXT: mla r3, r0, r5, r3 -; CHECK-NEXT: mla r12, r4, r1, r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: umull r5, r1, r0, r3 -; CHECK-NEXT: vmov q0[2], q0[0], r5, r2 -; CHECK-NEXT: asrs r2, r3, #31 -; CHECK-NEXT: mla r0, r0, r2, r1 -; CHECK-NEXT: mla r0, r4, r3, r0 +; CHECK-NEXT: umull r3, r4, r0, r1 +; CHECK-NEXT: and.w r2, r0, r1, asr #31 +; CHECK-NEXT: and.w r1, r1, r0, asr #31 +; CHECK-NEXT: subs r2, r4, r2 +; CHECK-NEXT: sub.w r12, r2, r1 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: umull r4, r1, r0, r2 +; CHECK-NEXT: vmov q0[2], q0[0], r4, r3 +; CHECK-NEXT: and.w r3, r0, r2, asr #31 +; CHECK-NEXT: and.w r0, r2, r0, asr #31 +; CHECK-NEXT: subs r1, r1, r3 +; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %shuf1 = shufflevector <8 x i32> %src1, <8 x i32> undef, <4 x i32> 
%out1 = sext <4 x i32> %shuf1 to <4 x i64> diff --git a/llvm/test/CodeGen/X86/extmul128.ll b/llvm/test/CodeGen/X86/extmul128.ll --- a/llvm/test/CodeGen/X86/extmul128.ll +++ b/llvm/test/CodeGen/X86/extmul128.ll @@ -29,8 +29,8 @@ ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: mulq %rsi ; CHECK-NEXT: sarq $63, %rsi -; CHECK-NEXT: imulq %rdi, %rsi -; CHECK-NEXT: addq %rsi, %rdx +; CHECK-NEXT: andq %rdi, %rsi +; CHECK-NEXT: subq %rsi, %rdx ; CHECK-NEXT: retq %aa = zext i64 %a to i128 %bb = sext i64 %b to i128 @@ -45,8 +45,8 @@ ; CHECK-NEXT: movq %rdi, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: mulq %rsi -; CHECK-NEXT: imulq %rsi, %rcx -; CHECK-NEXT: addq %rcx, %rdx +; CHECK-NEXT: andq %rsi, %rcx +; CHECK-NEXT: subq %rcx, %rdx ; CHECK-NEXT: retq %aa = sext i64 %a to i128 %bb = zext i64 %b to i128 diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -11,46 +11,51 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: pushq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %r12 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset %rbx, -32 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: .cfi_offset %rbx, -40 +; CHECK-NEXT: .cfi_offset %r12, -32 ; CHECK-NEXT: .cfi_offset %r14, -24 ; CHECK-NEXT: .cfi_offset %r15, -16 ; CHECK-NEXT: movq %rdx, %r11 ; CHECK-NEXT: movq %rsi, %r9 -; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: movq %rdi, %r14 ; CHECK-NEXT: sarq $63, %rsi -; CHECK-NEXT: movq %rdx, %rdi -; CHECK-NEXT: imulq %rsi, %rdi +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: andq %rdx, %rdi ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: mulq %rsi ; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: addq %rdi, %rdx -; CHECK-NEXT: imulq %rcx, %rsi -; CHECK-NEXT: addq %rdx, %rsi -; CHECK-NEXT: movq %rcx, %rdi -; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: imulq %r9, %rbx -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: mulq %r15 +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: subq %rdi, %rbx +; CHECK-NEXT: andq %rcx, %rsi +; CHECK-NEXT: subq %rsi, %rbx +; CHECK-NEXT: movq %rcx, %rsi +; CHECK-NEXT: sarq $63, %rsi +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: andq %r9, %rdi +; CHECK-NEXT: movq %rsi, %rax +; CHECK-NEXT: mulq %r14 ; CHECK-NEXT: movq %rax, %r10 -; CHECK-NEXT: addq %rbx, %rdx -; CHECK-NEXT: imulq %r15, %rdi -; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: movq %rdx, %r12 +; CHECK-NEXT: subq %rdi, %r12 +; CHECK-NEXT: andq %r14, %rsi +; CHECK-NEXT: subq %rsi, %r12 ; CHECK-NEXT: addq %r8, %r10 -; CHECK-NEXT: adcq %rsi, %rdi -; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: adcq %rbx, %r12 +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: mulq %r11 -; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movq %rdx, %r15 ; CHECK-NEXT: movq %rax, %r8 ; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: mulq %r11 ; CHECK-NEXT: movq %rdx, %rbx ; CHECK-NEXT: movq %rax, %rsi -; CHECK-NEXT: addq %r14, %rsi +; CHECK-NEXT: addq %r15, %rsi ; CHECK-NEXT: adcq $0, %rbx -; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: movq %r14, %rax ; CHECK-NEXT: mulq %rcx ; CHECK-NEXT: movq %rdx, %r14 ; CHECK-NEXT: movq %rax, %r11 @@ -63,7 +68,7 @@ ; CHECK-NEXT: addq %r14, %rax ; CHECK-NEXT: adcq %rsi, %rdx ; CHECK-NEXT: addq %r10, %rax -; CHECK-NEXT: adcq %rdi, %rdx +; CHECK-NEXT: adcq %r12, %rdx ; CHECK-NEXT: movq %r11, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: xorq %rcx, %rdx @@ -74,6 +79,7 @@ ; CHECK-NEXT: movq %r8, %rax ; CHECK-NEXT: movq %r11, %rdx ; 
CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -369,8 +369,8 @@ ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $8, %esp -; X86-NEXT: .cfi_def_cfa_offset 28 +; X86-NEXT: subl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 32 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 @@ -378,52 +378,54 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: andl %eax, %ebx +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: subl %ebx, %esi +; X86-NEXT: andl %ebp, %edi +; X86-NEXT: subl %edi, %esi ; X86-NEXT: movl %ebp, %edi -; X86-NEXT: imull %ebp, %ebx -; X86-NEXT: addl %edx, %ebx ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl %edi, %ebp -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl %ecx, %ebp ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: addl %ebp, %edx -; X86-NEXT: imull %esi, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %ebp, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edx, %edi +; X86-NEXT: subl %edi, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: movl %edx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %bl +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl %bl, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl %ebx, %edx ; X86-NEXT: movl %ebp, %edi ; X86-NEXT: sarl $31, %edi ; X86-NEXT: xorl %edi, %edx @@ -434,11 +436,11 @@ ; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF ; X86-NEXT: orl %edx, %edi ; X86-NEXT: notl %ecx -; X86-NEXT: cmovel (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Folded Reload ; X86-NEXT: cmovel %ebp, %esi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %esi, %edx -; X86-NEXT: addl $8, %esp +; X86-NEXT: addl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -9,49 +9,55 @@ ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %r13 ; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: pushq %rbx +; X64-NEXT: pushq %r12 ; X64-NEXT: .cfi_def_cfa_offset 40 -; X64-NEXT: .cfi_offset %rbx, -40 -; X64-NEXT: .cfi_offset %r12, -32 +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 48 +; X64-NEXT: .cfi_offset %rbx, -48 +; X64-NEXT: .cfi_offset %r12, -40 +; X64-NEXT: .cfi_offset %r13, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 -; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: movq %rdi, %r15 -; X64-NEXT: sarq $63, %rsi -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: imulq %rsi, %rdi -; X64-NEXT: movq %rdx, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %rcx, %rsi -; X64-NEXT: addq %rdx, %rsi -; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %rsi, %rdi ; X64-NEXT: sarq $63, %rdi ; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: imulq %r10, %rbx -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: andq %rdx, %rbx +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: subq %rbx, %rsi +; X64-NEXT: andq %rcx, %rdi +; X64-NEXT: subq %rdi, %rsi +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: sarq $63, %rbx +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: andq %r10, %rdi +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rbx, %rdx -; X64-NEXT: imulq %r15, %rdi -; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: subq %rdi, %r13 +; X64-NEXT: andq %r14, %rbx +; X64-NEXT: subq %rbx, %r13 ; X64-NEXT: addq %r9, %r11 -; X64-NEXT: adcq %rsi, %rdi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: adcq %rsi, %r13 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r14, %rsi +; X64-NEXT: addq %r12, %rsi ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r14 @@ -64,7 +70,7 @@ ; X64-NEXT: addq %r15, %rax ; X64-NEXT: adcq %rsi, %rdx ; X64-NEXT: addq %r11, %rax -; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: adcq %r13, %rdx ; X64-NEXT: movq %r14, 8(%r8) ; X64-NEXT: sarq $63, %r14 ; X64-NEXT: xorq %r14, %rdx @@ -74,6 +80,7 @@ ; X64-NEXT: movq %r9, (%r8) ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 +; X64-NEXT: popq %r13 ; X64-NEXT: popq %r14 ; X64-NEXT: popq %r15 ; X64-NEXT: retq @@ -88,8 +95,8 @@ ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $56, %esp -; X86-NEXT: .cfi_def_cfa_offset 76 +; X86-NEXT: subl $60, %esp +; 
X86-NEXT: .cfi_def_cfa_offset 80 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 @@ -103,226 +110,229 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl (%esp), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, 
%ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: adcl %ebx, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %esi +; 
X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl %edi, %esi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: imull %esi, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: imull %esi, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill +; X86-NEXT: subl %esi, %edi +; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: subl %ebx, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: subl %esi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: subl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl %esi, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl (%esp), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %edi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi -; X86-NEXT: imull 
%ebx, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sarl $31, %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %ebx, %ecx -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %eax, %ebx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl %eax, %esi +; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ebx, %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: andl %edi, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %eax -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: orl %eax, 
%edi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload @@ -330,7 +340,7 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: setne %al -; X86-NEXT: addl $56, %esp +; X86-NEXT: addl $60, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -364,228 +374,233 @@ ; X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rsi, %r14 ; X64-NEXT: movq %rdx, %rax +; X64-NEXT: movq %rsi, %r15 +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r10, %rbp +; X64-NEXT: addq %rbx, %rbp ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbp, %r12 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rbp, %r13 ; X64-NEXT: adcq %rsi, %rbx ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r9, %rcx ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rbx, %r10 -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rsi, %rbx ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r13 -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %r8 +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: addq %rbx, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: adcq %rbp, %rdi ; X64-NEXT: setb %bl -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %rdi, %rsi ; X64-NEXT: movzbl %bl, %eax -; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; X64-NEXT: addq %r11, %rsi -; X64-NEXT: adcq %r12, %r9 +; X64-NEXT: adcq %rax, %r14 +; 
X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; X64-NEXT: addq %r12, %rsi +; X64-NEXT: adcq %r13, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r13, %rbx -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r8, %rbp +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rdi, %rcx -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq %rbp, %rdi -; X64-NEXT: setb %r11b -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rdi, %rbp -; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: adcq %rbx, %rdi +; X64-NEXT: setb %r12b +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: movzbl %r12b, %eax ; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: addq %rsi, %r12 -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r9, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq $0, %rbp +; X64-NEXT: addq %rsi, %r13 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r14, %rbp +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq $0, %rbx ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: addq %r10, %rbp -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill +; X64-NEXT: addq %r10, %rbx +; X64-NEXT: adcq %r9, %rcx +; X64-NEXT: setb %r15b ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r10 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r10, %rbx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r9, %rbp ; X64-NEXT: adcq $0, %rdi ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rbp, %rax +; X64-NEXT: movq %rax, %r9 ; X64-NEXT: adcq %rdi, %rsi ; X64-NEXT: setb %dil ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %rsi, %r12 ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq 
%rbp, %r9 +; X64-NEXT: adcq %rax, %rdx +; X64-NEXT: addq %rbx, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rcx, %r9 ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %rcx, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 1-byte Folded Reload -; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: adcq $0, %r12 +; X64-NEXT: movzbl %r15b, %eax +; X64-NEXT: adcq %rax, %r12 +; X64-NEXT: adcq $0, %rdx +; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %r14, %rsi ; X64-NEXT: sarq $63, %rsi -; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: imulq %r13, %rdi -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdi, %rdx -; X64-NEXT: imulq %rsi, %r15 -; X64-NEXT: addq %rdx, %r15 ; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload -; X64-NEXT: imulq %rbp, %rbx -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: andq %r8, %rbx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: imulq %rsi, %r8 -; X64-NEXT: addq %rbx, %r8 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: addq %rax, %rcx -; X64-NEXT: adcq %r15, %r8 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: subq %rbx, %rdi +; X64-NEXT: andq %rsi, %r11 +; X64-NEXT: subq %r11, %rdi +; X64-NEXT: movq %rsi, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload +; X64-NEXT: andq %r10, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %rbx, %r15 +; X64-NEXT: subq %r15, %rbp +; X64-NEXT: andq %rsi, %rcx +; X64-NEXT: subq %rcx, %rbp +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: addq %rax, %r9 ; X64-NEXT: adcq %rdi, %rbp +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rbx, %r11 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: adcq %rbx, %rdi ; X64-NEXT: setb %bl -; X64-NEXT: addq %rax, %rbp -; X64-NEXT: movzbl %bl, %r9d -; X64-NEXT: adcq %rdx, %r9 -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: adcq %r8, %r9 -; X64-NEXT: sarq $63, %r13 +; X64-NEXT: addq %rax, %rdi +; X64-NEXT: movzbl %bl, %r15d +; X64-NEXT: adcq %rdx, %r15 +; X64-NEXT: addq %r9, %rdi +; X64-NEXT: adcq %rbp, %r15 +; X64-NEXT: sarq $63, %r8 +; X64-NEXT: movq %r8, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: imulq %r13, %rsi -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: subq %rcx, %rsi +; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload -; X64-NEXT: movq %r10, %rcx -; X64-NEXT: imulq %r13, %rcx -; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: andq %r10, %rax +; X64-NEXT: subq %rax, %rsi +; X64-NEXT: movq %r8, %r13 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: movq %rax, %rsi -; 
X64-NEXT: imulq %r13, %rsi -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq %r14, %rax -; X64-NEXT: imulq %r13, %rax -; X64-NEXT: addq %rdx, %rax -; X64-NEXT: addq %rdi, %r8 -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: addq %rbx, %rsi +; X64-NEXT: andq %rax, %r13 +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: subq %r13, %rcx +; X64-NEXT: andq %r8, %r14 +; X64-NEXT: subq %r14, %rcx +; X64-NEXT: addq %r9, %rbp +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: movq %r9, %r14 +; X64-NEXT: addq %rbx, %r14 ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r10 -; X64-NEXT: addq %rax, %rsi +; X64-NEXT: addq %rax, %r14 ; X64-NEXT: adcq %rdx, %rbx -; X64-NEXT: setb %cl +; X64-NEXT: setb %sil ; X64-NEXT: addq %rax, %rbx -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rdx, %rax -; X64-NEXT: addq %r8, %rbx -; X64-NEXT: adcq %r14, %rax -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload -; X64-NEXT: adcq %r15, %rsi -; X64-NEXT: adcq %rbp, %rbx -; X64-NEXT: adcq %r9, %rax -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload -; X64-NEXT: adcq %r11, %rbx -; X64-NEXT: adcq %r12, %rax +; X64-NEXT: addq %rbp, %rbx +; X64-NEXT: adcq %rcx, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Folded Reload +; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: adcq %rdi, %rbx +; X64-NEXT: adcq %r15, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 ## 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload +; X64-NEXT: adcq %r12, %rbx +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: sarq $63, %rcx ; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: xorq %rcx, %rsi -; X64-NEXT: orq %rax, %rsi +; X64-NEXT: xorq %rcx, %r14 +; X64-NEXT: orq %rax, %r14 ; X64-NEXT: xorq %rcx, %rbx -; X64-NEXT: xorq %rdi, %rcx +; X64-NEXT: xorq %r9, %rcx ; X64-NEXT: orq %rbx, %rcx -; X64-NEXT: orq %rsi, %rcx +; X64-NEXT: orq %r14, %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax ; X64-NEXT: movq %rdx, 24(%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload @@ -613,400 +628,399 @@ ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $156, %esp -; X86-NEXT: .cfi_def_cfa_offset 176 +; X86-NEXT: subl $152, %esp +; X86-NEXT: .cfi_def_cfa_offset 172 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: 
movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebx -; 
X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %ebp, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ebp ## 4-byte Folded Reload ; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx +; 
X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edi, %ebp -; X86-NEXT: setb %bl +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) 
## 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: addl %ebp, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: adcl %ebp, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edi, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl (%esp), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, 
%edi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %ecx, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl %edi, %esi -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; 
X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: setb %cl +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: adcl $0, 
%ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl %bl, %ecx -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: addl %ebx, %edi +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: adcl $0, %edx @@ -1019,9 +1033,9 @@ ; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: adcl $0, %esi @@ -1034,41 +1048,13 @@ ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: 
mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebx @@ -1077,89 +1063,117 @@ ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: setb %bl -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: setb %cl +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: setb %cl +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) 
## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %edi, %ebx ; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload -; X86-NEXT: movl %edx, %esi -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: addl %esi, %edx +; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload @@ -1175,25 +1189,25 @@ ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sarl $31, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: addl %eax, %ebx ; X86-NEXT: movzbl %cl, %eax @@ -1201,76 +1215,75 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %al, %edx ; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: setb %al -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edx, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull %edx +; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %edi, %esi -; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: subl %ecx, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl %edi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: andl %eax, %ebx ; X86-NEXT: mull %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl (%esp), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: movl %eax, %esi ; X86-NEXT: addl %ebp, %esi @@ -1280,263 +1293,266 @@ ; X86-NEXT: addl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: setb %bl ; X86-NEXT: addl %eax, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: 
movl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx -; X86-NEXT: addl %edx, %ecx +; X86-NEXT: movzbl %bl, %ebx +; X86-NEXT: adcl %edx, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: imull %edi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: andl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edi, %edx +; X86-NEXT: subl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %edx, %esi -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: subl %edx, %esi +; X86-NEXT: andl {{[0-9]+}}(%esp), %edi +; X86-NEXT: subl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: movl %eax, %edx ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: setb %cl -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl (%esp), %esi ## 4-byte Reload +; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: adcl %ebx, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %esi +; X86-NEXT: adcl %edx, %edi ; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %bl, %ebx -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %eax, %edi +; X86-NEXT: movzbl %bl, %ebp +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl %edx, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %ebp -; X86-NEXT: adcl %edx, %edi -; X86-NEXT: setb %cl -; X86-NEXT: addl %eax, %edi -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %edi, %edx +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl %edx, %ecx +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: addl %eax, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte 
Reload ; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl %ebx, %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %edi, %edx +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl %eax, %edi ; X86-NEXT: setb %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl %edi, %eax +; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: imull %ebp, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: andl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ebp, %eax -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: andl %edx, %eax +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebp, %ecx +; X86-NEXT: andl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebp, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: addl %esi, %eax +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: subl %ecx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %dl ; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movzbl %dl, %eax -; X86-NEXT: adcl %ecx, %eax +; X86-NEXT: movzbl %dl, %ecx +; X86-NEXT: adcl %ebp, %ecx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ebp, 
%ecx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: andl %ebp, %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edx, %esi +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: andl %ebp, %ecx +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ebp, %eax -; X86-NEXT: addl %esi, %eax -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: subl %ecx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: movl %edx, %esi -; X86-NEXT: adcl $0, %esi -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %edx, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: subl %eax, %ebp +; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: adcl $0, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: setb %cl +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl (%esp), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, 
%edi +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: xorl %ecx, %esi ; X86-NEXT: orl %edx, %esi -; X86-NEXT: xorl %edi, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: xorl %edi, %edx -; X86-NEXT: xorl %edi, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: xorl %edi, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: orl %ebp, %edi -; X86-NEXT: orl %ecx, %edi +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: xorl %ecx, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, 28(%eax) +; X86-NEXT: movl %ebp, 28(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload @@ -1552,7 +1568,7 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, 24(%eax) ; X86-NEXT: setne %al -; X86-NEXT: addl $156, %esp +; X86-NEXT: addl $152, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -3301,32 +3301,33 @@ ; SSE2-NEXT: movq %rdx, %r15 ; SSE2-NEXT: movq %rsi, %r13 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: movq %rsi, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movq %r14, %rsi -; SSE2-NEXT: imulq %rcx, %rsi +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: movq %rsi, %rbx +; SSE2-NEXT: andq %r14, %rbx ; SSE2-NEXT: movq %r14, %rax -; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: addq %rsi, %rdx -; SSE2-NEXT: imulq %r9, %rcx -; SSE2-NEXT: addq %rdx, %rcx -; SSE2-NEXT: movq %r9, %rbx -; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movq %rbx, %rsi -; SSE2-NEXT: imulq %r13, %rsi -; SSE2-NEXT: movq %rbx, %rax +; SSE2-NEXT: movq %rdx, %rcx +; SSE2-NEXT: subq %rbx, %rcx +; SSE2-NEXT: andq %r9, %rsi +; SSE2-NEXT: subq %rsi, %rcx +; SSE2-NEXT: movq %r9, %rsi +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: movq %rsi, %rbp +; SSE2-NEXT: andq %r13, %rbp +; SSE2-NEXT: movq %rsi, %rax ; SSE2-NEXT: mulq %rdi ; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: addq %rsi, %rdx -; SSE2-NEXT: imulq %rdi, %rbx -; SSE2-NEXT: addq %rdx, %rbx +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: subq %rbp, %rbx +; SSE2-NEXT: andq %rdi, %rsi +; SSE2-NEXT: subq %rsi, %rbx ; SSE2-NEXT: addq %r10, %r12 ; SSE2-NEXT: adcq %rcx, %rbx ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r14 ; SSE2-NEXT: movq %rdx, %rbp -; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq %r13, %rax ; SSE2-NEXT: mulq %r14 ; SSE2-NEXT: movq %rdx, %rsi @@ -3357,58 +3358,59 @@ ; SSE2-NEXT: setne %r12b ; SSE2-NEXT: movq %r11, %rdi ; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: movq %rdi, %rbp +; SSE2-NEXT: andq %r8, %rbp ; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: movq %r8, %rsi -; SSE2-NEXT: imulq %rdi, %rsi -; SSE2-NEXT: movq %r8, 
%rbx ; SSE2-NEXT: mulq %rdi ; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: addq %rsi, %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: imulq %r8, %rdi -; SSE2-NEXT: addq %rdx, %rdi -; SSE2-NEXT: movq %r8, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movq %rsi, %rbp -; SSE2-NEXT: imulq %r11, %rbp -; SSE2-NEXT: movq %rsi, %rax +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: subq %rbp, %rsi +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: andq %r10, %rdi +; SSE2-NEXT: subq %rdi, %rsi +; SSE2-NEXT: movq %r10, %rdi +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: movq %rdi, %rbp +; SSE2-NEXT: andq %r11, %rbp +; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r15 ; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: addq %rbp, %rdx -; SSE2-NEXT: imulq %r15, %rsi -; SSE2-NEXT: addq %rdx, %rsi +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: subq %rbp, %rbx +; SSE2-NEXT: andq %r15, %rdi +; SSE2-NEXT: subq %rdi, %rbx ; SSE2-NEXT: addq %rcx, %r14 -; SSE2-NEXT: adcq %rdi, %rsi +; SSE2-NEXT: adcq %rsi, %rbx ; SSE2-NEXT: movq %r15, %rax -; SSE2-NEXT: mulq %rbx +; SSE2-NEXT: mulq %r8 ; SSE2-NEXT: movq %rdx, %rcx ; SSE2-NEXT: movq %rax, %r9 ; SSE2-NEXT: movq %r11, %rax -; SSE2-NEXT: mulq %rbx -; SSE2-NEXT: movq %rdx, %rbx -; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: addq %rcx, %rbp -; SSE2-NEXT: adcq $0, %rbx -; SSE2-NEXT: movq %r15, %rax ; SSE2-NEXT: mulq %r8 +; SSE2-NEXT: movq %rdx, %rdi +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: addq %rcx, %rsi +; SSE2-NEXT: adcq $0, %rdi +; SSE2-NEXT: movq %r15, %rax +; SSE2-NEXT: mulq %r10 ; SSE2-NEXT: movq %rdx, %rcx -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: addq %rbp, %rdi -; SSE2-NEXT: adcq %rbx, %rcx +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: addq %rsi, %rbp +; SSE2-NEXT: adcq %rdi, %rcx ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %ebp +; SSE2-NEXT: movzbl %al, %esi ; SSE2-NEXT: movq %r11, %rax -; SSE2-NEXT: mulq %r8 +; SSE2-NEXT: mulq %r10 ; SSE2-NEXT: addq %rcx, %rax -; SSE2-NEXT: adcq %rbp, %rdx -; SSE2-NEXT: addq %r14, %rax ; SSE2-NEXT: adcq %rsi, %rdx -; SSE2-NEXT: movq %rdi, 24(%r13) -; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: xorq %rdi, %rdx -; SSE2-NEXT: xorq %rax, %rdi +; SSE2-NEXT: addq %r14, %rax +; SSE2-NEXT: adcq %rbx, %rdx +; SSE2-NEXT: movq %rbp, 24(%r13) +; SSE2-NEXT: sarq $63, %rbp +; SSE2-NEXT: xorq %rbp, %rdx +; SSE2-NEXT: xorq %rax, %rbp ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: orq %rdx, %rbp ; SSE2-NEXT: setne %al ; SSE2-NEXT: negl %eax ; SSE2-NEXT: movd %eax, %xmm1 @@ -3416,7 +3418,8 @@ ; SSE2-NEXT: movd %r12d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq %r9, 16(%r13) -; SSE2-NEXT: movq %r10, (%r13) +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE2-NEXT: movq %rax, (%r13) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3438,32 +3441,33 @@ ; SSSE3-NEXT: movq %rdx, %r15 ; SSSE3-NEXT: movq %rsi, %r13 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSSE3-NEXT: movq %rsi, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movq %r14, %rsi -; SSSE3-NEXT: imulq %rcx, %rsi +; SSSE3-NEXT: sarq $63, %rsi +; SSSE3-NEXT: movq %rsi, %rbx +; SSSE3-NEXT: andq %r14, %rbx ; SSSE3-NEXT: movq %r14, %rax -; SSSE3-NEXT: mulq %rcx +; SSSE3-NEXT: mulq %rsi ; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: addq %rsi, %rdx -; SSSE3-NEXT: imulq %r9, %rcx -; SSSE3-NEXT: addq %rdx, %rcx -; SSSE3-NEXT: movq %r9, %rbx -; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movq %rbx, %rsi -; SSSE3-NEXT: imulq %r13, %rsi -; SSSE3-NEXT: movq %rbx, %rax +; SSSE3-NEXT: 
movq %rdx, %rcx +; SSSE3-NEXT: subq %rbx, %rcx +; SSSE3-NEXT: andq %r9, %rsi +; SSSE3-NEXT: subq %rsi, %rcx +; SSSE3-NEXT: movq %r9, %rsi +; SSSE3-NEXT: sarq $63, %rsi +; SSSE3-NEXT: movq %rsi, %rbp +; SSSE3-NEXT: andq %r13, %rbp +; SSSE3-NEXT: movq %rsi, %rax ; SSSE3-NEXT: mulq %rdi ; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: addq %rsi, %rdx -; SSSE3-NEXT: imulq %rdi, %rbx -; SSSE3-NEXT: addq %rdx, %rbx +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: subq %rbp, %rbx +; SSSE3-NEXT: andq %rdi, %rsi +; SSSE3-NEXT: subq %rsi, %rbx ; SSSE3-NEXT: addq %r10, %r12 ; SSSE3-NEXT: adcq %rcx, %rbx ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r14 ; SSSE3-NEXT: movq %rdx, %rbp -; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSSE3-NEXT: movq %r13, %rax ; SSSE3-NEXT: mulq %r14 ; SSSE3-NEXT: movq %rdx, %rsi @@ -3494,58 +3498,59 @@ ; SSSE3-NEXT: setne %r12b ; SSSE3-NEXT: movq %r11, %rdi ; SSSE3-NEXT: sarq $63, %rdi +; SSSE3-NEXT: movq %rdi, %rbp +; SSSE3-NEXT: andq %r8, %rbp ; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: movq %r8, %rsi -; SSSE3-NEXT: imulq %rdi, %rsi -; SSSE3-NEXT: movq %r8, %rbx ; SSSE3-NEXT: mulq %rdi ; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: addq %rsi, %rdx -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSSE3-NEXT: imulq %r8, %rdi -; SSSE3-NEXT: addq %rdx, %rdi -; SSSE3-NEXT: movq %r8, %rsi -; SSSE3-NEXT: sarq $63, %rsi -; SSSE3-NEXT: movq %rsi, %rbp -; SSSE3-NEXT: imulq %r11, %rbp -; SSSE3-NEXT: movq %rsi, %rax +; SSSE3-NEXT: movq %rdx, %rsi +; SSSE3-NEXT: subq %rbp, %rsi +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSSE3-NEXT: andq %r10, %rdi +; SSSE3-NEXT: subq %rdi, %rsi +; SSSE3-NEXT: movq %r10, %rdi +; SSSE3-NEXT: sarq $63, %rdi +; SSSE3-NEXT: movq %rdi, %rbp +; SSSE3-NEXT: andq %r11, %rbp +; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r15 ; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: addq %rbp, %rdx -; SSSE3-NEXT: imulq %r15, %rsi -; SSSE3-NEXT: addq %rdx, %rsi +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: subq %rbp, %rbx +; SSSE3-NEXT: andq %r15, %rdi +; SSSE3-NEXT: subq %rdi, %rbx ; SSSE3-NEXT: addq %rcx, %r14 -; SSSE3-NEXT: adcq %rdi, %rsi +; SSSE3-NEXT: adcq %rsi, %rbx ; SSSE3-NEXT: movq %r15, %rax -; SSSE3-NEXT: mulq %rbx +; SSSE3-NEXT: mulq %r8 ; SSSE3-NEXT: movq %rdx, %rcx ; SSSE3-NEXT: movq %rax, %r9 ; SSSE3-NEXT: movq %r11, %rax -; SSSE3-NEXT: mulq %rbx -; SSSE3-NEXT: movq %rdx, %rbx -; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: addq %rcx, %rbp -; SSSE3-NEXT: adcq $0, %rbx -; SSSE3-NEXT: movq %r15, %rax ; SSSE3-NEXT: mulq %r8 +; SSSE3-NEXT: movq %rdx, %rdi +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: addq %rcx, %rsi +; SSSE3-NEXT: adcq $0, %rdi +; SSSE3-NEXT: movq %r15, %rax +; SSSE3-NEXT: mulq %r10 ; SSSE3-NEXT: movq %rdx, %rcx -; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: addq %rbp, %rdi -; SSSE3-NEXT: adcq %rbx, %rcx +; SSSE3-NEXT: movq %rax, %rbp +; SSSE3-NEXT: addq %rsi, %rbp +; SSSE3-NEXT: adcq %rdi, %rcx ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %ebp +; SSSE3-NEXT: movzbl %al, %esi ; SSSE3-NEXT: movq %r11, %rax -; SSSE3-NEXT: mulq %r8 +; SSSE3-NEXT: mulq %r10 ; SSSE3-NEXT: addq %rcx, %rax -; SSSE3-NEXT: adcq %rbp, %rdx -; SSSE3-NEXT: addq %r14, %rax ; SSSE3-NEXT: adcq %rsi, %rdx -; SSSE3-NEXT: movq %rdi, 24(%r13) -; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: xorq %rdi, %rdx -; SSSE3-NEXT: xorq %rax, %rdi +; SSSE3-NEXT: addq %r14, %rax +; SSSE3-NEXT: adcq %rbx, %rdx +; SSSE3-NEXT: movq %rbp, 24(%r13) +; SSSE3-NEXT: sarq $63, %rbp +; SSSE3-NEXT: xorq %rbp, %rdx +; SSSE3-NEXT: xorq %rax, %rbp ; 
SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: orq %rdx, %rdi +; SSSE3-NEXT: orq %rdx, %rbp ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: negl %eax ; SSSE3-NEXT: movd %eax, %xmm1 @@ -3553,7 +3558,8 @@ ; SSSE3-NEXT: movd %r12d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSSE3-NEXT: movq %r9, 16(%r13) -; SSSE3-NEXT: movq %r10, (%r13) +; SSSE3-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSSE3-NEXT: movq %rax, (%r13) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3575,32 +3581,33 @@ ; SSE41-NEXT: movq %rdx, %r15 ; SSE41-NEXT: movq %rsi, %r13 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE41-NEXT: movq %rsi, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %r14, %rsi -; SSE41-NEXT: imulq %rcx, %rsi +; SSE41-NEXT: sarq $63, %rsi +; SSE41-NEXT: movq %rsi, %rbx +; SSE41-NEXT: andq %r14, %rbx ; SSE41-NEXT: movq %r14, %rax -; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: addq %rsi, %rdx -; SSE41-NEXT: imulq %r9, %rcx -; SSE41-NEXT: addq %rdx, %rcx -; SSE41-NEXT: movq %r9, %rbx -; SSE41-NEXT: sarq $63, %rbx -; SSE41-NEXT: movq %rbx, %rsi -; SSE41-NEXT: imulq %r13, %rsi -; SSE41-NEXT: movq %rbx, %rax +; SSE41-NEXT: movq %rdx, %rcx +; SSE41-NEXT: subq %rbx, %rcx +; SSE41-NEXT: andq %r9, %rsi +; SSE41-NEXT: subq %rsi, %rcx +; SSE41-NEXT: movq %r9, %rsi +; SSE41-NEXT: sarq $63, %rsi +; SSE41-NEXT: movq %rsi, %rbp +; SSE41-NEXT: andq %r13, %rbp +; SSE41-NEXT: movq %rsi, %rax ; SSE41-NEXT: mulq %rdi ; SSE41-NEXT: movq %rax, %r12 -; SSE41-NEXT: addq %rsi, %rdx -; SSE41-NEXT: imulq %rdi, %rbx -; SSE41-NEXT: addq %rdx, %rbx +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: subq %rbp, %rbx +; SSE41-NEXT: andq %rdi, %rsi +; SSE41-NEXT: subq %rsi, %rbx ; SSE41-NEXT: addq %r10, %r12 ; SSE41-NEXT: adcq %rcx, %rbx ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r14 ; SSE41-NEXT: movq %rdx, %rbp -; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE41-NEXT: movq %r13, %rax ; SSE41-NEXT: mulq %r14 ; SSE41-NEXT: movq %rdx, %rsi @@ -3631,65 +3638,67 @@ ; SSE41-NEXT: setne %r12b ; SSE41-NEXT: movq %r11, %rdi ; SSE41-NEXT: sarq $63, %rdi +; SSE41-NEXT: movq %rdi, %rbp +; SSE41-NEXT: andq %r8, %rbp ; SSE41-NEXT: movq %r8, %rax -; SSE41-NEXT: movq %r8, %rsi -; SSE41-NEXT: imulq %rdi, %rsi -; SSE41-NEXT: movq %r8, %rbx ; SSE41-NEXT: mulq %rdi ; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: addq %rsi, %rdx -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE41-NEXT: imulq %r8, %rdi -; SSE41-NEXT: addq %rdx, %rdi -; SSE41-NEXT: movq %r8, %rsi -; SSE41-NEXT: sarq $63, %rsi -; SSE41-NEXT: movq %rsi, %rbp -; SSE41-NEXT: imulq %r11, %rbp -; SSE41-NEXT: movq %rsi, %rax +; SSE41-NEXT: movq %rdx, %rsi +; SSE41-NEXT: subq %rbp, %rsi +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE41-NEXT: andq %r10, %rdi +; SSE41-NEXT: subq %rdi, %rsi +; SSE41-NEXT: movq %r10, %rdi +; SSE41-NEXT: sarq $63, %rdi +; SSE41-NEXT: movq %rdi, %rbp +; SSE41-NEXT: andq %r11, %rbp +; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r15 ; SSE41-NEXT: movq %rax, %r14 -; SSE41-NEXT: addq %rbp, %rdx -; SSE41-NEXT: imulq %r15, %rsi -; SSE41-NEXT: addq %rdx, %rsi +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: subq %rbp, %rbx +; SSE41-NEXT: andq %r15, %rdi +; SSE41-NEXT: subq %rdi, %rbx ; SSE41-NEXT: addq %rcx, %r14 -; SSE41-NEXT: adcq %rdi, %rsi +; SSE41-NEXT: adcq %rsi, %rbx ; SSE41-NEXT: movq %r15, %rax -; SSE41-NEXT: mulq %rbx +; SSE41-NEXT: mulq %r8 ; SSE41-NEXT: movq %rdx, %rcx ; SSE41-NEXT: movq %rax, 
%r9 ; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: mulq %rbx -; SSE41-NEXT: movq %rdx, %rbx -; SSE41-NEXT: movq %rax, %rbp -; SSE41-NEXT: addq %rcx, %rbp -; SSE41-NEXT: adcq $0, %rbx -; SSE41-NEXT: movq %r15, %rax ; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: movq %rdx, %rdi +; SSE41-NEXT: movq %rax, %rsi +; SSE41-NEXT: addq %rcx, %rsi +; SSE41-NEXT: adcq $0, %rdi +; SSE41-NEXT: movq %r15, %rax +; SSE41-NEXT: mulq %r10 ; SSE41-NEXT: movq %rdx, %rcx -; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: addq %rbp, %rdi -; SSE41-NEXT: adcq %rbx, %rcx +; SSE41-NEXT: movq %rax, %rbp +; SSE41-NEXT: addq %rsi, %rbp +; SSE41-NEXT: adcq %rdi, %rcx ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %ebp +; SSE41-NEXT: movzbl %al, %esi ; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: mulq %r10 ; SSE41-NEXT: addq %rcx, %rax -; SSE41-NEXT: adcq %rbp, %rdx -; SSE41-NEXT: addq %r14, %rax ; SSE41-NEXT: adcq %rsi, %rdx -; SSE41-NEXT: movq %rdi, 24(%r13) -; SSE41-NEXT: sarq $63, %rdi -; SSE41-NEXT: xorq %rdi, %rdx -; SSE41-NEXT: xorq %rax, %rdi +; SSE41-NEXT: addq %r14, %rax +; SSE41-NEXT: adcq %rbx, %rdx +; SSE41-NEXT: movq %rbp, 24(%r13) +; SSE41-NEXT: sarq $63, %rbp +; SSE41-NEXT: xorq %rbp, %rdx +; SSE41-NEXT: xorq %rax, %rbp ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: orq %rdx, %rbp ; SSE41-NEXT: setne %al ; SSE41-NEXT: negl %eax ; SSE41-NEXT: negl %r12d ; SSE41-NEXT: movd %r12d, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 ; SSE41-NEXT: movq %r9, 16(%r13) -; SSE41-NEXT: movq %r10, (%r13) +; SSE41-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE41-NEXT: movq %rax, (%r13) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -3711,32 +3720,33 @@ ; AVX-NEXT: movq %rdx, %r15 ; AVX-NEXT: movq %rsi, %r13 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: movq %rsi, %rcx -; AVX-NEXT: sarq $63, %rcx -; AVX-NEXT: movq %r14, %rsi -; AVX-NEXT: imulq %rcx, %rsi +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: movq %rsi, %rbx +; AVX-NEXT: andq %r14, %rbx ; AVX-NEXT: movq %r14, %rax -; AVX-NEXT: mulq %rcx +; AVX-NEXT: mulq %rsi ; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: addq %rsi, %rdx -; AVX-NEXT: imulq %r9, %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: movq %r9, %rbx -; AVX-NEXT: sarq $63, %rbx -; AVX-NEXT: movq %rbx, %rsi -; AVX-NEXT: imulq %r13, %rsi -; AVX-NEXT: movq %rbx, %rax +; AVX-NEXT: movq %rdx, %rcx +; AVX-NEXT: subq %rbx, %rcx +; AVX-NEXT: andq %r9, %rsi +; AVX-NEXT: subq %rsi, %rcx +; AVX-NEXT: movq %r9, %rsi +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: movq %rsi, %rbp +; AVX-NEXT: andq %r13, %rbp +; AVX-NEXT: movq %rsi, %rax ; AVX-NEXT: mulq %rdi ; AVX-NEXT: movq %rax, %r12 -; AVX-NEXT: addq %rsi, %rdx -; AVX-NEXT: imulq %rdi, %rbx -; AVX-NEXT: addq %rdx, %rbx +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: subq %rbp, %rbx +; AVX-NEXT: andq %rdi, %rsi +; AVX-NEXT: subq %rsi, %rbx ; AVX-NEXT: addq %r10, %r12 ; AVX-NEXT: adcq %rcx, %rbx ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r14 ; AVX-NEXT: movq %rdx, %rbp -; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX-NEXT: movq %r13, %rax ; AVX-NEXT: mulq %r14 ; AVX-NEXT: movq %rdx, %rsi @@ -3767,65 +3777,67 @@ ; AVX-NEXT: setne %r12b ; AVX-NEXT: movq %r11, %rdi ; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: movq %rdi, %rbp +; AVX-NEXT: andq %r8, %rbp ; AVX-NEXT: movq %r8, %rax -; AVX-NEXT: movq %r8, %rsi -; AVX-NEXT: imulq %rdi, %rsi -; AVX-NEXT: movq %r8, %rbx ; AVX-NEXT: mulq %rdi ; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: addq %rsi, %rdx -; AVX-NEXT: movq 
{{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: imulq %r8, %rdi -; AVX-NEXT: addq %rdx, %rdi -; AVX-NEXT: movq %r8, %rsi -; AVX-NEXT: sarq $63, %rsi -; AVX-NEXT: movq %rsi, %rbp -; AVX-NEXT: imulq %r11, %rbp -; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: subq %rbp, %rsi +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX-NEXT: andq %r10, %rdi +; AVX-NEXT: subq %rdi, %rsi +; AVX-NEXT: movq %r10, %rdi +; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: movq %rdi, %rbp +; AVX-NEXT: andq %r11, %rbp +; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r15 ; AVX-NEXT: movq %rax, %r14 -; AVX-NEXT: addq %rbp, %rdx -; AVX-NEXT: imulq %r15, %rsi -; AVX-NEXT: addq %rdx, %rsi +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: subq %rbp, %rbx +; AVX-NEXT: andq %r15, %rdi +; AVX-NEXT: subq %rdi, %rbx ; AVX-NEXT: addq %rcx, %r14 -; AVX-NEXT: adcq %rdi, %rsi +; AVX-NEXT: adcq %rsi, %rbx ; AVX-NEXT: movq %r15, %rax -; AVX-NEXT: mulq %rbx +; AVX-NEXT: mulq %r8 ; AVX-NEXT: movq %rdx, %rcx ; AVX-NEXT: movq %rax, %r9 ; AVX-NEXT: movq %r11, %rax -; AVX-NEXT: mulq %rbx -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: movq %rax, %rbp -; AVX-NEXT: addq %rcx, %rbp -; AVX-NEXT: adcq $0, %rbx -; AVX-NEXT: movq %r15, %rax ; AVX-NEXT: mulq %r8 +; AVX-NEXT: movq %rdx, %rdi +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: addq %rcx, %rsi +; AVX-NEXT: adcq $0, %rdi +; AVX-NEXT: movq %r15, %rax +; AVX-NEXT: mulq %r10 ; AVX-NEXT: movq %rdx, %rcx -; AVX-NEXT: movq %rax, %rdi -; AVX-NEXT: addq %rbp, %rdi -; AVX-NEXT: adcq %rbx, %rcx +; AVX-NEXT: movq %rax, %rbp +; AVX-NEXT: addq %rsi, %rbp +; AVX-NEXT: adcq %rdi, %rcx ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %ebp +; AVX-NEXT: movzbl %al, %esi ; AVX-NEXT: movq %r11, %rax -; AVX-NEXT: mulq %r8 +; AVX-NEXT: mulq %r10 ; AVX-NEXT: addq %rcx, %rax -; AVX-NEXT: adcq %rbp, %rdx -; AVX-NEXT: addq %r14, %rax ; AVX-NEXT: adcq %rsi, %rdx -; AVX-NEXT: movq %rdi, 24(%r13) -; AVX-NEXT: sarq $63, %rdi -; AVX-NEXT: xorq %rdi, %rdx -; AVX-NEXT: xorq %rax, %rdi +; AVX-NEXT: addq %r14, %rax +; AVX-NEXT: adcq %rbx, %rdx +; AVX-NEXT: movq %rbp, 24(%r13) +; AVX-NEXT: sarq $63, %rbp +; AVX-NEXT: xorq %rbp, %rdx +; AVX-NEXT: xorq %rax, %rbp ; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: orq %rdx, %rdi +; AVX-NEXT: orq %rdx, %rbp ; AVX-NEXT: setne %al ; AVX-NEXT: negl %eax ; AVX-NEXT: negl %r12d ; AVX-NEXT: vmovd %r12d, %xmm0 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ; AVX-NEXT: movq %r9, 16(%r13) -; AVX-NEXT: movq %r10, (%r13) +; AVX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX-NEXT: movq %rax, (%r13) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -3842,119 +3854,125 @@ ; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq %r9, %r10 +; AVX512F-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512F-NEXT: movq %r8, %r9 -; AVX512F-NEXT: movq %rcx, %r14 -; AVX512F-NEXT: movq %rdx, %rcx +; AVX512F-NEXT: movq %rcx, %r12 +; AVX512F-NEXT: movq %rdx, %rbx ; AVX512F-NEXT: movq %rsi, %r11 -; AVX512F-NEXT: movq %rdi, %r15 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512F-NEXT: movq %rdi, %r14 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512F-NEXT: movq %r14, %rdi -; AVX512F-NEXT: sarq $63, %rdi -; AVX512F-NEXT: movq %r12, %rbx -; AVX512F-NEXT: imulq %rdi, %rbx -; AVX512F-NEXT: movq %r12, %rax -; AVX512F-NEXT: mulq %rdi -; AVX512F-NEXT: movq %rax, %rsi -; AVX512F-NEXT: addq %rbx, %rdx -; AVX512F-NEXT: imulq %r8, %rdi -; AVX512F-NEXT: addq %rdx, %rdi -; AVX512F-NEXT: movq %r8, %rbx -; 
AVX512F-NEXT: sarq $63, %rbx -; AVX512F-NEXT: movq %rbx, %rbp -; AVX512F-NEXT: imulq %r14, %rbp -; AVX512F-NEXT: movq %rbx, %rax -; AVX512F-NEXT: mulq %rcx +; AVX512F-NEXT: movq %rcx, %rsi +; AVX512F-NEXT: sarq $63, %rsi +; AVX512F-NEXT: movq %rsi, %rdi +; AVX512F-NEXT: andq %r15, %rdi +; AVX512F-NEXT: movq %r15, %rax +; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: movq %rax, %r10 +; AVX512F-NEXT: movq %rdx, %rcx +; AVX512F-NEXT: subq %rdi, %rcx +; AVX512F-NEXT: andq %r8, %rsi +; AVX512F-NEXT: subq %rsi, %rcx +; AVX512F-NEXT: movq %r8, %rsi +; AVX512F-NEXT: sarq $63, %rsi +; AVX512F-NEXT: movq %rsi, %rbp +; AVX512F-NEXT: andq %r12, %rbp +; AVX512F-NEXT: movq %rsi, %rax +; AVX512F-NEXT: mulq %rbx ; AVX512F-NEXT: movq %rax, %r13 -; AVX512F-NEXT: addq %rbp, %rdx -; AVX512F-NEXT: imulq %rcx, %rbx -; AVX512F-NEXT: addq %rdx, %rbx -; AVX512F-NEXT: addq %rsi, %r13 -; AVX512F-NEXT: adcq %rdi, %rbx -; AVX512F-NEXT: movq %rcx, %rax -; AVX512F-NEXT: mulq %r12 -; AVX512F-NEXT: movq %rdx, %rbp -; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512F-NEXT: movq %r14, %rax -; AVX512F-NEXT: mulq %r12 ; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: movq %rax, %rsi -; AVX512F-NEXT: addq %rbp, %rsi -; AVX512F-NEXT: adcq $0, %rdi -; AVX512F-NEXT: movq %rcx, %rax -; AVX512F-NEXT: mulq %r8 +; AVX512F-NEXT: subq %rbp, %rdi +; AVX512F-NEXT: andq %rbx, %rsi +; AVX512F-NEXT: subq %rsi, %rdi +; AVX512F-NEXT: addq %r10, %r13 +; AVX512F-NEXT: adcq %rcx, %rdi +; AVX512F-NEXT: movq %rbx, %rax +; AVX512F-NEXT: mulq %r15 ; AVX512F-NEXT: movq %rdx, %rbp +; AVX512F-NEXT: movq %rax, %r10 +; AVX512F-NEXT: movq %r12, %rax +; AVX512F-NEXT: mulq %r15 +; AVX512F-NEXT: movq %rdx, %rsi ; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: addq %rsi, %rcx -; AVX512F-NEXT: adcq %rdi, %rbp +; AVX512F-NEXT: addq %rbp, %rcx +; AVX512F-NEXT: adcq $0, %rsi +; AVX512F-NEXT: movq %rbx, %rax +; AVX512F-NEXT: mulq %r8 +; AVX512F-NEXT: movq %rdx, %rbp +; AVX512F-NEXT: movq %rax, %rbx +; AVX512F-NEXT: addq %rcx, %rbx +; AVX512F-NEXT: adcq %rsi, %rbp ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %esi -; AVX512F-NEXT: movq %r14, %rax +; AVX512F-NEXT: movzbl %al, %ecx +; AVX512F-NEXT: movq %r12, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: addq %rbp, %rax -; AVX512F-NEXT: adcq %rsi, %rdx +; AVX512F-NEXT: adcq %rcx, %rdx ; AVX512F-NEXT: addq %r13, %rax -; AVX512F-NEXT: adcq %rbx, %rdx -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512F-NEXT: movq %rcx, 24(%r8) -; AVX512F-NEXT: sarq $63, %rcx -; AVX512F-NEXT: xorq %rcx, %rdx -; AVX512F-NEXT: xorq %rax, %rcx -; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: adcq %rdi, %rdx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512F-NEXT: movq %rbx, 24(%r13) +; AVX512F-NEXT: sarq $63, %rbx +; AVX512F-NEXT: xorq %rbx, %rdx +; AVX512F-NEXT: xorq %rax, %rbx +; AVX512F-NEXT: orq %rdx, %rbx ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: movq %r11, %rdi ; AVX512F-NEXT: sarq $63, %rdi -; AVX512F-NEXT: movq %r9, %rsi -; AVX512F-NEXT: imulq %rdi, %rsi +; AVX512F-NEXT: movq %rdi, %rbp +; AVX512F-NEXT: andq %r9, %rbp ; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: addq %rsi, %rdx -; AVX512F-NEXT: imulq %r10, %rdi -; AVX512F-NEXT: addq %rdx, %rdi -; AVX512F-NEXT: movq %r10, %rsi -; AVX512F-NEXT: sarq $63, %rsi -; AVX512F-NEXT: movq %rsi, %rbp -; AVX512F-NEXT: imulq %r11, %rbp -; AVX512F-NEXT: movq %rsi, %rax -; AVX512F-NEXT: mulq %r15 +; AVX512F-NEXT: movq %rdx, %rsi +; AVX512F-NEXT: subq %rbp, %rsi +; 
AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: andq %rax, %rdi +; AVX512F-NEXT: subq %rdi, %rsi +; AVX512F-NEXT: movq %rax, %rdi +; AVX512F-NEXT: movq %rax, %r8 +; AVX512F-NEXT: sarq $63, %rdi +; AVX512F-NEXT: movq %rdi, %rbx +; AVX512F-NEXT: andq %r11, %rbx +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: mulq %r14 ; AVX512F-NEXT: movq %rax, %r12 -; AVX512F-NEXT: addq %rbp, %rdx -; AVX512F-NEXT: imulq %r15, %rsi -; AVX512F-NEXT: addq %rdx, %rsi +; AVX512F-NEXT: movq %rdx, %rbp +; AVX512F-NEXT: subq %rbx, %rbp +; AVX512F-NEXT: andq %r14, %rdi +; AVX512F-NEXT: subq %rdi, %rbp ; AVX512F-NEXT: addq %rcx, %r12 -; AVX512F-NEXT: adcq %rdi, %rsi -; AVX512F-NEXT: movq %r15, %rax +; AVX512F-NEXT: adcq %rsi, %rbp +; AVX512F-NEXT: movq %r14, %rax ; AVX512F-NEXT: mulq %r9 ; AVX512F-NEXT: movq %rdx, %rcx -; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %rax, %r15 ; AVX512F-NEXT: movq %r11, %rax ; AVX512F-NEXT: mulq %r9 -; AVX512F-NEXT: movq %rdx, %rbp -; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: addq %rcx, %rbx -; AVX512F-NEXT: adcq $0, %rbp -; AVX512F-NEXT: movq %r15, %rax -; AVX512F-NEXT: mulq %r10 +; AVX512F-NEXT: movq %rdx, %rdi +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: addq %rcx, %rsi +; AVX512F-NEXT: adcq $0, %rdi +; AVX512F-NEXT: movq %r14, %rax +; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %rcx -; AVX512F-NEXT: movq %rax, %rdi -; AVX512F-NEXT: addq %rbx, %rdi -; AVX512F-NEXT: adcq %rbp, %rcx +; AVX512F-NEXT: movq %rax, %rbx +; AVX512F-NEXT: addq %rsi, %rbx +; AVX512F-NEXT: adcq %rdi, %rcx ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %ebp +; AVX512F-NEXT: movzbl %al, %esi ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %r10 +; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: addq %rcx, %rax -; AVX512F-NEXT: adcq %rbp, %rdx -; AVX512F-NEXT: addq %r12, %rax ; AVX512F-NEXT: adcq %rsi, %rdx -; AVX512F-NEXT: movq %rdi, 8(%r8) -; AVX512F-NEXT: sarq $63, %rdi -; AVX512F-NEXT: xorq %rdi, %rdx -; AVX512F-NEXT: xorq %rax, %rdi -; AVX512F-NEXT: orq %rdx, %rdi +; AVX512F-NEXT: addq %r12, %rax +; AVX512F-NEXT: adcq %rbp, %rdx +; AVX512F-NEXT: movq %rbx, 8(%r13) +; AVX512F-NEXT: sarq $63, %rbx +; AVX512F-NEXT: xorq %rbx, %rdx +; AVX512F-NEXT: xorq %rax, %rbx +; AVX512F-NEXT: orq %rdx, %rbx ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k1 @@ -3962,9 +3980,8 @@ ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: movq %rax, 16(%r8) -; AVX512F-NEXT: movq %r14, (%r8) +; AVX512F-NEXT: movq %r10, 16(%r13) +; AVX512F-NEXT: movq %r15, (%r13) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -3981,119 +3998,125 @@ ; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx -; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512BW-NEXT: movq %r8, %r9 -; AVX512BW-NEXT: movq %rcx, %r14 -; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: movq %rcx, %r12 +; AVX512BW-NEXT: movq %rdx, %rbx ; AVX512BW-NEXT: movq %rsi, %r11 -; AVX512BW-NEXT: movq %rdi, %r15 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512BW-NEXT: movq %rdi, %r14 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512BW-NEXT: movq %r14, %rdi -; AVX512BW-NEXT: sarq $63, %rdi -; AVX512BW-NEXT: movq %r12, %rbx -; AVX512BW-NEXT: imulq %rdi, 
%rbx -; AVX512BW-NEXT: movq %r12, %rax -; AVX512BW-NEXT: mulq %rdi -; AVX512BW-NEXT: movq %rax, %rsi -; AVX512BW-NEXT: addq %rbx, %rdx -; AVX512BW-NEXT: imulq %r8, %rdi -; AVX512BW-NEXT: addq %rdx, %rdi -; AVX512BW-NEXT: movq %r8, %rbx -; AVX512BW-NEXT: sarq $63, %rbx -; AVX512BW-NEXT: movq %rbx, %rbp -; AVX512BW-NEXT: imulq %r14, %rbp -; AVX512BW-NEXT: movq %rbx, %rax -; AVX512BW-NEXT: mulq %rcx +; AVX512BW-NEXT: movq %rcx, %rsi +; AVX512BW-NEXT: sarq $63, %rsi +; AVX512BW-NEXT: movq %rsi, %rdi +; AVX512BW-NEXT: andq %r15, %rdi +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: subq %rdi, %rcx +; AVX512BW-NEXT: andq %r8, %rsi +; AVX512BW-NEXT: subq %rsi, %rcx +; AVX512BW-NEXT: movq %r8, %rsi +; AVX512BW-NEXT: sarq $63, %rsi +; AVX512BW-NEXT: movq %rsi, %rbp +; AVX512BW-NEXT: andq %r12, %rbp +; AVX512BW-NEXT: movq %rsi, %rax +; AVX512BW-NEXT: mulq %rbx ; AVX512BW-NEXT: movq %rax, %r13 -; AVX512BW-NEXT: addq %rbp, %rdx -; AVX512BW-NEXT: imulq %rcx, %rbx -; AVX512BW-NEXT: addq %rdx, %rbx -; AVX512BW-NEXT: addq %rsi, %r13 -; AVX512BW-NEXT: adcq %rdi, %rbx -; AVX512BW-NEXT: movq %rcx, %rax -; AVX512BW-NEXT: mulq %r12 -; AVX512BW-NEXT: movq %rdx, %rbp -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512BW-NEXT: movq %r14, %rax -; AVX512BW-NEXT: mulq %r12 ; AVX512BW-NEXT: movq %rdx, %rdi -; AVX512BW-NEXT: movq %rax, %rsi -; AVX512BW-NEXT: addq %rbp, %rsi -; AVX512BW-NEXT: adcq $0, %rdi -; AVX512BW-NEXT: movq %rcx, %rax -; AVX512BW-NEXT: mulq %r8 +; AVX512BW-NEXT: subq %rbp, %rdi +; AVX512BW-NEXT: andq %rbx, %rsi +; AVX512BW-NEXT: subq %rsi, %rdi +; AVX512BW-NEXT: addq %r10, %r13 +; AVX512BW-NEXT: adcq %rcx, %rdi +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: mulq %r15 ; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: movq %rax, %r10 +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: mulq %r15 +; AVX512BW-NEXT: movq %rdx, %rsi ; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: addq %rsi, %rcx -; AVX512BW-NEXT: adcq %rdi, %rbp +; AVX512BW-NEXT: addq %rbp, %rcx +; AVX512BW-NEXT: adcq $0, %rsi +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: mulq %r8 +; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: addq %rcx, %rbx +; AVX512BW-NEXT: adcq %rsi, %rbp ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %esi -; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: movzbl %al, %ecx +; AVX512BW-NEXT: movq %r12, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: addq %rbp, %rax -; AVX512BW-NEXT: adcq %rsi, %rdx +; AVX512BW-NEXT: adcq %rcx, %rdx ; AVX512BW-NEXT: addq %r13, %rax -; AVX512BW-NEXT: adcq %rbx, %rdx -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512BW-NEXT: movq %rcx, 24(%r8) -; AVX512BW-NEXT: sarq $63, %rcx -; AVX512BW-NEXT: xorq %rcx, %rdx -; AVX512BW-NEXT: xorq %rax, %rcx -; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: adcq %rdi, %rdx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX512BW-NEXT: movq %rbx, 24(%r13) +; AVX512BW-NEXT: sarq $63, %rbx +; AVX512BW-NEXT: xorq %rbx, %rdx +; AVX512BW-NEXT: xorq %rax, %rbx +; AVX512BW-NEXT: orq %rdx, %rbx ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: movq %r11, %rdi ; AVX512BW-NEXT: sarq $63, %rdi -; AVX512BW-NEXT: movq %r9, %rsi -; AVX512BW-NEXT: imulq %rdi, %rsi +; AVX512BW-NEXT: movq %rdi, %rbp +; AVX512BW-NEXT: andq %r9, %rbp ; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: addq %rsi, 
%rdx -; AVX512BW-NEXT: imulq %r10, %rdi -; AVX512BW-NEXT: addq %rdx, %rdi -; AVX512BW-NEXT: movq %r10, %rsi -; AVX512BW-NEXT: sarq $63, %rsi -; AVX512BW-NEXT: movq %rsi, %rbp -; AVX512BW-NEXT: imulq %r11, %rbp -; AVX512BW-NEXT: movq %rsi, %rax -; AVX512BW-NEXT: mulq %r15 +; AVX512BW-NEXT: movq %rdx, %rsi +; AVX512BW-NEXT: subq %rbp, %rsi +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: andq %rax, %rdi +; AVX512BW-NEXT: subq %rdi, %rsi +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: movq %rax, %r8 +; AVX512BW-NEXT: sarq $63, %rdi +; AVX512BW-NEXT: movq %rdi, %rbx +; AVX512BW-NEXT: andq %r11, %rbx +; AVX512BW-NEXT: movq %rdi, %rax +; AVX512BW-NEXT: mulq %r14 ; AVX512BW-NEXT: movq %rax, %r12 -; AVX512BW-NEXT: addq %rbp, %rdx -; AVX512BW-NEXT: imulq %r15, %rsi -; AVX512BW-NEXT: addq %rdx, %rsi +; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: subq %rbx, %rbp +; AVX512BW-NEXT: andq %r14, %rdi +; AVX512BW-NEXT: subq %rdi, %rbp ; AVX512BW-NEXT: addq %rcx, %r12 -; AVX512BW-NEXT: adcq %rdi, %rsi -; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: adcq %rsi, %rbp +; AVX512BW-NEXT: movq %r14, %rax ; AVX512BW-NEXT: mulq %r9 ; AVX512BW-NEXT: movq %rdx, %rcx -; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %rax, %r15 ; AVX512BW-NEXT: movq %r11, %rax ; AVX512BW-NEXT: mulq %r9 -; AVX512BW-NEXT: movq %rdx, %rbp -; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: addq %rcx, %rbx -; AVX512BW-NEXT: adcq $0, %rbp -; AVX512BW-NEXT: movq %r15, %rax -; AVX512BW-NEXT: mulq %r10 +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: addq %rcx, %rsi +; AVX512BW-NEXT: adcq $0, %rdi +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %rcx -; AVX512BW-NEXT: movq %rax, %rdi -; AVX512BW-NEXT: addq %rbx, %rdi -; AVX512BW-NEXT: adcq %rbp, %rcx +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: addq %rsi, %rbx +; AVX512BW-NEXT: adcq %rdi, %rcx ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %ebp +; AVX512BW-NEXT: movzbl %al, %esi ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %r10 +; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: addq %rcx, %rax -; AVX512BW-NEXT: adcq %rbp, %rdx -; AVX512BW-NEXT: addq %r12, %rax ; AVX512BW-NEXT: adcq %rsi, %rdx -; AVX512BW-NEXT: movq %rdi, 8(%r8) -; AVX512BW-NEXT: sarq $63, %rdi -; AVX512BW-NEXT: xorq %rdi, %rdx -; AVX512BW-NEXT: xorq %rax, %rdi -; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: addq %r12, %rax +; AVX512BW-NEXT: adcq %rbp, %rdx +; AVX512BW-NEXT: movq %rbx, 8(%r13) +; AVX512BW-NEXT: sarq $63, %rbx +; AVX512BW-NEXT: xorq %rbx, %rdx +; AVX512BW-NEXT: xorq %rax, %rbx +; AVX512BW-NEXT: orq %rdx, %rbx ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: andl $1, %eax ; AVX512BW-NEXT: kmovw %eax, %k1 @@ -4101,9 +4124,8 @@ ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: movq %rax, 16(%r8) -; AVX512BW-NEXT: movq %r14, (%r8) +; AVX512BW-NEXT: movq %r10, 16(%r13) +; AVX512BW-NEXT: movq %r15, (%r13) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -215,35 +215,36 @@ ; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), 
%ecx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: imull %ebp, %edi -; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebx, %esi ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %ebp -; WIN32-NEXT: imull %ecx, %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %ebp, %edx -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: andl %eax, %edi +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %edi, %esi -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: subl %edi, %ecx +; WIN32-NEXT: andl %ebp, %esi +; WIN32-NEXT: subl %esi, %ecx +; WIN32-NEXT: sarl $31, %ebp +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: andl %ebx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: andl %ebx, %ebp +; WIN32-NEXT: subl %ebp, %esi +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl %ebx, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx @@ -262,7 +263,7 @@ ; WIN32-NEXT: addl %edi, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; WIN32-NEXT: adcl %esi, %edx ; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx @@ -271,7 +272,7 @@ ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %ebp, 4(%eax) -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al ; WIN32-NEXT: addl $8, %esp @@ -573,49 +574,52 @@ ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: imull %ecx, %edi -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: andl %eax, %edi ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: addl %edi, %edx -; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: sarl $31, %ebx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: imull %esi, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: addl %edi, %edx -; 
WIN32-NEXT: movl %esi, %edi -; WIN32-NEXT: imull %esi, %ebx -; WIN32-NEXT: addl %edx, %ebx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: andl %ebx, %ecx +; WIN32-NEXT: subl %ecx, %esi +; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: andl %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: subl %edi, %ebx +; WIN32-NEXT: movl %ebp, %edi +; WIN32-NEXT: andl %ebp, %ecx +; WIN32-NEXT: subl %ecx, %ebx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %ebx +; WIN32-NEXT: adcl %esi, %ebx ; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %esi, %ebp -; WIN32-NEXT: adcl $0, %ecx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: adcl $0, %ebp ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %ebp, %esi +; WIN32-NEXT: addl %ecx, %esi +; WIN32-NEXT: adcl %ebp, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: adcl %ecx, %edi ; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) @@ -999,30 +1003,32 @@ ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: movl %ebx, %esi -; WIN32-NEXT: imull %ebx, %edi -; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: movl %ecx, %esi +; WIN32-NEXT: movl %ecx, %ebp ; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %ebx -; WIN32-NEXT: imull %ecx, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %ebx, %edx -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: andl %eax, %edi +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %edi, %esi -; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: movl %ecx, %edi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: subl %edi, %ecx +; WIN32-NEXT: andl %ebx, %esi +; WIN32-NEXT: subl %esi, %ecx +; WIN32-NEXT: sarl $31, %ebx +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: andl %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: andl %ebp, %ebx +; WIN32-NEXT: subl %ebx, %esi +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %ebp, %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx @@ -1704,57 +1710,62 @@ ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $16, %esp -; WIN32-NEXT: movl 
{{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %esi -; WIN32-NEXT: movl 4(%eax), %ebp -; WIN32-NEXT: sarl $31, %ebx -; WIN32-NEXT: movl %ebx, %ecx -; WIN32-NEXT: imull %ebp, %ecx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl 4(%eax), %eax +; WIN32-NEXT: sarl $31, %edi +; WIN32-NEXT: movl %edi, %ecx +; WIN32-NEXT: andl %eax, %ecx +; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: addl %ecx, %edx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: subl %ecx, %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %esi, %ebx -; WIN32-NEXT: addl %edx, %ebx -; WIN32-NEXT: movl %ebp, %ecx -; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: andl %esi, %edi +; WIN32-NEXT: subl %edi, %ebp +; WIN32-NEXT: movl %ebx, %ecx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: movl %ecx, %ebx +; WIN32-NEXT: andl %eax, %ebx ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %edi, %edx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: subl %ebx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: andl %edx, %ecx +; WIN32-NEXT: subl %ecx, %edi +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %ebx, %ecx +; WIN32-NEXT: adcl %ebp, %edi ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl %ebx, %edi +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; WIN32-NEXT: adcl $0, %ebp ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %edi, %esi -; WIN32-NEXT: adcl %ebp, %ebx -; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: addl %ebx, %esi +; WIN32-NEXT: adcl %ebp, %ecx +; WIN32-NEXT: setb %bl ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; WIN32-NEXT: adcl %edi, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: movzbl %bl, %ecx ; WIN32-NEXT: adcl %ecx, %edx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: adcl %edi, %edx ; WIN32-NEXT: movl %esi, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx @@ -1762,7 
+1773,7 @@ ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %esi, 4(%eax) -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al ; WIN32-NEXT: addl $16, %esp @@ -1810,35 +1821,35 @@ ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $12, %esp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %ebp ; WIN32-NEXT: movl 4(%eax), %ebx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: movl %ebp, %esi -; WIN32-NEXT: imull %edi, %esi +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: movl %esi, %edi +; WIN32-NEXT: andl %ebp, %edi ; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: mull %esi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: addl %esi, %edx -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: subl %edi, %ecx ; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: imull %ebx, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: sarl $31, %esi -; WIN32-NEXT: movl %esi, %ebx -; WIN32-NEXT: imull %ecx, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: addl %ebx, %edx -; WIN32-NEXT: imull %ecx, %esi -; WIN32-NEXT: addl %edx, %esi +; WIN32-NEXT: andl %ebx, %esi +; WIN32-NEXT: subl %esi, %ecx +; WIN32-NEXT: sarl $31, %ebx +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: andl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: subl %edi, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: andl %edx, %ebx +; WIN32-NEXT: subl %ebx, %esi ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %edi, %esi -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl %edx, %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %edi ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill