diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -209,9 +209,6 @@
                           DenseMap &InstrIdxForVirtReg,
                           MachineTraceMetrics::Trace BlockTrace) {
   SmallVector InstrDepth;
-  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
-         "Missing machine model\n");
-
   // For each instruction in the new sequence compute the depth based on the
   // operands. Use the trace information when possible. For new operands which
   // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
@@ -267,9 +264,6 @@
 /// \returns Latency of \p NewRoot
 unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                      MachineTraceMetrics::Trace BlockTrace) {
-  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
-         "Missing machine model\n");
-
   // Check each definition in NewRoot and compute the latency
   unsigned NewRootLatency = 0;
@@ -379,8 +373,6 @@
     DenseMap &InstrIdxForVirtReg,
     MachineCombinerPattern Pattern, bool SlackIsAccurate) {
-  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
-         "Missing machine model\n");
   // Get depth and latency of NewRoot and Root.
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
@@ -696,13 +688,6 @@
         // Eagerly stop after the first pattern fires.
         Changed = true;
         break;
-      } else if (!TSchedModel.hasInstrSchedModelOrItineraries()) {
-        LLVM_DEBUG(dbgs() << "\t Replacing due to lack of schedule model\n");
-        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
-                                 RegUnits, TII, P, IncrementalUpdate);
-        // Eagerly stop after the first pattern fires.
-        Changed = true;
-        break;
       } else {
         // For big basic blocks, we only compute the full trace the first time
         // we hit this. We do not invalidate the trace, but instead update the
diff --git a/llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll b/llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll
--- a/llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll
+++ b/llvm/test/CodeGen/RISCV/addc-adde-sube-subc.ll
@@ -7,9 +7,9 @@
 define i64 @addc_adde(i64 %a, i64 %b) nounwind {
 ; RV32I-LABEL: addc_adde:
 ; RV32I:       # %bb.0:
+; RV32I-NEXT:    add a1, a1, a3
 ; RV32I-NEXT:    add a2, a0, a2
 ; RV32I-NEXT:    sltu a0, a2, a0
-; RV32I-NEXT:    add a0, a3, a0
 ; RV32I-NEXT:    add a1, a1, a0
 ; RV32I-NEXT:    mv a0, a2
 ; RV32I-NEXT:    ret
@@ -21,8 +21,8 @@
 ; RV32I-LABEL: subc_sube:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    sltu a4, a0, a2
-; RV32I-NEXT:    add a3, a3, a4
 ; RV32I-NEXT:    sub a1, a1, a3
+; RV32I-NEXT:    sub a1, a1, a4
 ; RV32I-NEXT:    sub a0, a0, a2
 ; RV32I-NEXT:    ret
   %1 = sub i64 %a, %b
diff --git a/llvm/test/CodeGen/RISCV/addcarry.ll b/llvm/test/CodeGen/RISCV/addcarry.ll
--- a/llvm/test/CodeGen/RISCV/addcarry.ll
+++ b/llvm/test/CodeGen/RISCV/addcarry.ll
@@ -19,11 +19,11 @@
 ; RISCV32-NEXT:    sltu a5, a6, a5
 ; RISCV32-NEXT:    mulhu a6, a0, a3
 ; RISCV32-NEXT:    mulhu t0, a1, a2
-; RISCV32-NEXT:    add a5, a5, t0
-; RISCV32-NEXT:    add a5, a5, a7
-; RISCV32-NEXT:    mul a7, a1, a3
-; RISCV32-NEXT:    add a5, a5, a7
+; RISCV32-NEXT:    add a6, a6, t0
 ; RISCV32-NEXT:    add a5, a6, a5
+; RISCV32-NEXT:    add a5, a5, a7
+; RISCV32-NEXT:    mul a6, a1, a3
+; RISCV32-NEXT:    add a5, a5, a6
 ; RISCV32-NEXT:    bgez a1, .LBB0_2
 ; RISCV32-NEXT:  # %bb.1:
 ; RISCV32-NEXT:    sub a5, a5, a2
diff --git a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
--- a/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
+++ b/llvm/test/CodeGen/RISCV/addimm-mulimm.ll
@@ -51,11 +51,11 @@
 ; RV32IMB-NEXT:    li a2, 29
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    addi a0, a2, 1073
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_accept_a3:
@@ -121,13 +121,13 @@
 ; RV32IMB-NEXT:    li a2, 23
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 50
 ; RV32IMB-NEXT:    addi a0, a0, 1119
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_accept_b3:
@@ -187,13 +187,13 @@
 ; RV32IMB-NEXT:    li a2, 29
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 14
 ; RV32IMB-NEXT:    addi a0, a0, -185
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_reject_a3:
@@ -253,13 +253,13 @@
 ; RV32IMB-NEXT:    li a2, 73
 ; RV32IMB-NEXT:    mul a1, a1, a2
 ; RV32IMB-NEXT:    mulhu a3, a0, a2
+; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    mul a2, a0, a2
 ; RV32IMB-NEXT:    lui a0, 18
 ; RV32IMB-NEXT:    addi a0, a0, -728
 ; RV32IMB-NEXT:    add a0, a2, a0
 ; RV32IMB-NEXT:    sltu a2, a0, a2
 ; RV32IMB-NEXT:    add a1, a1, a2
-; RV32IMB-NEXT:    add a1, a3, a1
 ; RV32IMB-NEXT:    ret
 ;
 ; RV64IMB-LABEL: add_mul_combine_reject_c3:
@@ -318,14 +318,14 @@
 ; RV32IMB-NEXT:    mulhu a2, a0, a2
 ; RV32IMB-NEXT:    sh1add a1, a1, a1
 ; RV32IMB-NEXT:    slli a1, a1, 6
+; RV32IMB-NEXT:    add a1, a2, a1
 ; RV32IMB-NEXT:    sh1add a0, a0, a0
-; RV32IMB-NEXT:    slli a3, a0, 6
+;
RV32IMB-NEXT: slli a2, a0, 6 ; RV32IMB-NEXT: lui a0, 47 ; RV32IMB-NEXT: addi a0, a0, -512 -; RV32IMB-NEXT: add a0, a3, a0 -; RV32IMB-NEXT: sltu a3, a0, a3 -; RV32IMB-NEXT: add a1, a1, a3 -; RV32IMB-NEXT: add a1, a2, a1 +; RV32IMB-NEXT: add a0, a2, a0 +; RV32IMB-NEXT: sltu a2, a0, a2 +; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; ; RV64IMB-LABEL: add_mul_combine_reject_d3: @@ -383,13 +383,13 @@ ; RV32IMB-NEXT: li a2, 29 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 ; RV32IMB-NEXT: lui a0, 14 ; RV32IMB-NEXT: addi a0, a0, -185 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: ret ; ; RV64IMB-LABEL: add_mul_combine_reject_e3: @@ -451,13 +451,13 @@ ; RV32IMB-NEXT: li a2, 29 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 ; RV32IMB-NEXT: lui a0, 14 ; RV32IMB-NEXT: addi a0, a0, -145 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: ret ; ; RV64IMB-LABEL: add_mul_combine_reject_f3: @@ -520,13 +520,13 @@ ; RV32IMB-NEXT: li a2, 73 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 ; RV32IMB-NEXT: lui a0, 2 ; RV32IMB-NEXT: addi a0, a0, -882 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: ret ; ; RV64IMB-LABEL: add_mul_combine_reject_g3: @@ -622,13 +622,13 @@ ; RV32IMB-NEXT: addi a2, a2, -1096 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 ; RV32IMB-NEXT: lui a0, 2 ; RV32IMB-NEXT: addi a0, a0, 798 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: ret ; ; RV64IMB-LABEL: mul3000_add8990_c: @@ -697,13 +697,13 @@ ; RV32IMB-NEXT: addi a2, a2, -1096 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 ; RV32IMB-NEXT: lui a0, 1048574 ; RV32IMB-NEXT: addi a0, a0, -798 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 ; RV32IMB-NEXT: add a1, a1, a2 -; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: addi a1, a1, -1 ; RV32IMB-NEXT: ret ; @@ -773,14 +773,14 @@ ; RV32IMB-NEXT: addi a2, a2, 1096 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 -; RV32IMB-NEXT: sub a1, a0, a1 +; RV32IMB-NEXT: sub a3, a3, a0 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 ; RV32IMB-NEXT: lui a0, 2 ; RV32IMB-NEXT: addi a0, a0, 798 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 -; RV32IMB-NEXT: sub a1, a1, a2 -; RV32IMB-NEXT: sub a1, a3, a1 +; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: ret ; ; RV64IMB-LABEL: mulneg3000_add8990_c: @@ -849,14 +849,14 @@ ; RV32IMB-NEXT: addi a2, a2, 1096 ; RV32IMB-NEXT: mul a1, a1, a2 ; RV32IMB-NEXT: mulhu a3, a0, a2 -; RV32IMB-NEXT: sub a1, a0, a1 +; RV32IMB-NEXT: sub a3, a3, a0 +; RV32IMB-NEXT: add a1, a3, a1 ; RV32IMB-NEXT: mul a2, a0, a2 ; RV32IMB-NEXT: lui a0, 1048574 ; RV32IMB-NEXT: addi a0, a0, -798 ; RV32IMB-NEXT: add a0, a2, a0 ; RV32IMB-NEXT: sltu a2, a0, a2 -; RV32IMB-NEXT: sub a1, a1, a2 -; RV32IMB-NEXT: sub a1, a3, a1 +; RV32IMB-NEXT: add a1, a1, a2 ; RV32IMB-NEXT: addi a1, a1, -1 ; 
RV32IMB-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll --- a/llvm/test/CodeGen/RISCV/alu64.ll +++ b/llvm/test/CodeGen/RISCV/alu64.ll @@ -172,9 +172,9 @@ ; ; RV32I-LABEL: add: ; RV32I: # %bb.0: +; RV32I-NEXT: add a1, a1, a3 ; RV32I-NEXT: add a2, a0, a2 ; RV32I-NEXT: sltu a0, a2, a0 -; RV32I-NEXT: add a0, a3, a0 ; RV32I-NEXT: add a1, a1, a0 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret @@ -191,8 +191,8 @@ ; RV32I-LABEL: sub: ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 -; RV32I-NEXT: add a3, a3, a4 ; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret %1 = sub i64 %a, %b diff --git a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll --- a/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/bswap-bitreverse.ll @@ -64,7 +64,7 @@ ; RV32I-NEXT: and a2, a0, a2 ; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret ; @@ -79,7 +79,7 @@ ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: slliw a0, a0, 24 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; @@ -109,7 +109,7 @@ ; RV32I-NEXT: and a4, a1, a3 ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a2, a4, a2 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: or a2, a1, a2 ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: and a1, a1, a3 @@ -118,7 +118,7 @@ ; RV32I-NEXT: and a3, a0, a3 ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret @@ -137,8 +137,8 @@ ; RV64I-NEXT: srli a5, a0, 8 ; RV64I-NEXT: srliw a5, a5, 24 ; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: or a1, a5, a1 ; RV64I-NEXT: and a4, a0, a4 ; RV64I-NEXT: slli a4, a4, 24 ; RV64I-NEXT: srliw a3, a0, 24 @@ -147,8 +147,8 @@ ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: slli a2, a2, 40 ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: or a2, a2, a3 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; @@ -401,7 +401,7 @@ ; RV32I-NEXT: and a2, a0, a2 ; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: lui a2, 61681 @@ -437,7 +437,7 @@ ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: slliw a0, a0, 24 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 ; RV64I-NEXT: lui a2, 61681 @@ -545,7 +545,7 @@ ; RV32I-NEXT: and a4, a1, a3 ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a2, a4, a2 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: srli a2, a1, 4 ; RV32I-NEXT: lui a4, 61681 @@ -575,7 +575,7 @@ ; RV32I-NEXT: and a3, a0, a3 ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: srli a1, a0, 4 ; RV32I-NEXT: and a1, a1, a4 @@ -609,8 +609,8 @@ ; RV64I-NEXT: srli a5, a0, 8 ; RV64I-NEXT: srliw a5, a5, 24 ; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: or a1, a5, a1 ; 
RV64I-NEXT: and a4, a0, a4 ; RV64I-NEXT: slli a4, a4, 24 ; RV64I-NEXT: srliw a3, a0, 24 @@ -619,14 +619,14 @@ ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: slli a2, a2, 40 ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: or a2, a2, a3 -; RV64I-NEXT: lui a3, %hi(.LCPI6_0) -; RV64I-NEXT: ld a3, %lo(.LCPI6_0)(a3) -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: lui a2, %hi(.LCPI6_0) +; RV64I-NEXT: ld a2, %lo(.LCPI6_0)(a2) +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srli a1, a0, 4 -; RV64I-NEXT: and a1, a1, a3 -; RV64I-NEXT: and a0, a0, a3 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: and a0, a0, a2 ; RV64I-NEXT: lui a2, %hi(.LCPI6_1) ; RV64I-NEXT: ld a2, %lo(.LCPI6_1)(a2) ; RV64I-NEXT: slli a0, a0, 4 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-common.ll @@ -94,15 +94,15 @@ ; RV32I-FPELIM-LABEL: callee_aligned_stack: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: lw a0, 0(a2) -; RV32I-FPELIM-NEXT: lw a1, 0(sp) -; RV32I-FPELIM-NEXT: lw a2, 8(sp) -; RV32I-FPELIM-NEXT: lw a3, 16(sp) -; RV32I-FPELIM-NEXT: lw a4, 20(sp) -; RV32I-FPELIM-NEXT: add a1, a7, a1 -; RV32I-FPELIM-NEXT: add a1, a1, a2 -; RV32I-FPELIM-NEXT: add a1, a1, a3 -; RV32I-FPELIM-NEXT: add a1, a1, a4 +; RV32I-FPELIM-NEXT: lw a1, 8(sp) +; RV32I-FPELIM-NEXT: lw a2, 0(sp) +; RV32I-FPELIM-NEXT: lw a3, 20(sp) +; RV32I-FPELIM-NEXT: lw a4, 16(sp) +; RV32I-FPELIM-NEXT: add a0, a0, a7 +; RV32I-FPELIM-NEXT: add a1, a2, a1 ; RV32I-FPELIM-NEXT: add a0, a0, a1 +; RV32I-FPELIM-NEXT: add a3, a4, a3 +; RV32I-FPELIM-NEXT: add a0, a0, a3 ; RV32I-FPELIM-NEXT: ret ; ; RV32I-WITHFP-LABEL: callee_aligned_stack: @@ -112,15 +112,15 @@ ; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 16 ; RV32I-WITHFP-NEXT: lw a0, 0(a2) -; RV32I-WITHFP-NEXT: lw a1, 0(s0) -; RV32I-WITHFP-NEXT: lw a2, 8(s0) -; RV32I-WITHFP-NEXT: lw a3, 16(s0) -; RV32I-WITHFP-NEXT: lw a4, 20(s0) -; RV32I-WITHFP-NEXT: add a1, a7, a1 -; RV32I-WITHFP-NEXT: add a1, a1, a2 -; RV32I-WITHFP-NEXT: add a1, a1, a3 -; RV32I-WITHFP-NEXT: add a1, a1, a4 +; RV32I-WITHFP-NEXT: lw a1, 8(s0) +; RV32I-WITHFP-NEXT: lw a2, 0(s0) +; RV32I-WITHFP-NEXT: lw a3, 20(s0) +; RV32I-WITHFP-NEXT: lw a4, 16(s0) +; RV32I-WITHFP-NEXT: add a0, a0, a7 +; RV32I-WITHFP-NEXT: add a1, a2, a1 ; RV32I-WITHFP-NEXT: add a0, a0, a1 +; RV32I-WITHFP-NEXT: add a3, a4, a3 +; RV32I-WITHFP-NEXT: add a0, a0, a3 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll --- a/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-ilp32-ilp32f-ilp32d-common.ll @@ -87,12 +87,12 @@ ; RV32I-FPELIM-NEXT: andi a0, a0, 255 ; RV32I-FPELIM-NEXT: slli a1, a1, 16 ; RV32I-FPELIM-NEXT: srli a1, a1, 16 -; RV32I-FPELIM-NEXT: add a1, a1, a2 -; RV32I-FPELIM-NEXT: xor a2, a4, t1 -; RV32I-FPELIM-NEXT: xor a3, a3, a7 -; RV32I-FPELIM-NEXT: or a2, a3, a2 -; RV32I-FPELIM-NEXT: seqz a2, a2 -; RV32I-FPELIM-NEXT: add a1, a2, a1 +; RV32I-FPELIM-NEXT: add a0, a0, a2 +; RV32I-FPELIM-NEXT: add a0, a0, a1 +; RV32I-FPELIM-NEXT: xor a1, a4, t1 +; RV32I-FPELIM-NEXT: xor a2, a3, a7 +; 
RV32I-FPELIM-NEXT: or a1, a2, a1 +; RV32I-FPELIM-NEXT: seqz a1, a1 ; RV32I-FPELIM-NEXT: add a0, a0, a5 ; RV32I-FPELIM-NEXT: add a0, a0, a6 ; RV32I-FPELIM-NEXT: add a0, a0, t0 @@ -110,12 +110,12 @@ ; RV32I-WITHFP-NEXT: andi a0, a0, 255 ; RV32I-WITHFP-NEXT: slli a1, a1, 16 ; RV32I-WITHFP-NEXT: srli a1, a1, 16 -; RV32I-WITHFP-NEXT: add a1, a1, a2 -; RV32I-WITHFP-NEXT: xor a2, a4, t1 -; RV32I-WITHFP-NEXT: xor a3, a3, a7 -; RV32I-WITHFP-NEXT: or a2, a3, a2 -; RV32I-WITHFP-NEXT: seqz a2, a2 -; RV32I-WITHFP-NEXT: add a1, a2, a1 +; RV32I-WITHFP-NEXT: add a0, a0, a2 +; RV32I-WITHFP-NEXT: add a0, a0, a1 +; RV32I-WITHFP-NEXT: xor a1, a4, t1 +; RV32I-WITHFP-NEXT: xor a2, a3, a7 +; RV32I-WITHFP-NEXT: or a1, a2, a1 +; RV32I-WITHFP-NEXT: seqz a1, a1 ; RV32I-WITHFP-NEXT: add a0, a0, a5 ; RV32I-WITHFP-NEXT: add a0, a0, a6 ; RV32I-WITHFP-NEXT: add a0, a0, t0 @@ -203,8 +203,8 @@ ; RV32I-FPELIM-NEXT: or a4, a4, a5 ; RV32I-FPELIM-NEXT: xor a0, a0, a1 ; RV32I-FPELIM-NEXT: xor a2, a3, a2 -; RV32I-FPELIM-NEXT: or a0, a0, a4 ; RV32I-FPELIM-NEXT: or a0, a2, a0 +; RV32I-FPELIM-NEXT: or a0, a0, a4 ; RV32I-FPELIM-NEXT: seqz a0, a0 ; RV32I-FPELIM-NEXT: ret ; @@ -227,8 +227,8 @@ ; RV32I-WITHFP-NEXT: or a4, a4, a5 ; RV32I-WITHFP-NEXT: xor a0, a0, a1 ; RV32I-WITHFP-NEXT: xor a2, a3, a2 -; RV32I-WITHFP-NEXT: or a0, a0, a4 ; RV32I-WITHFP-NEXT: or a0, a2, a0 +; RV32I-WITHFP-NEXT: or a0, a0, a4 ; RV32I-WITHFP-NEXT: seqz a0, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -310,8 +310,8 @@ ; RV32I-FPELIM-NEXT: or a3, a3, a4 ; RV32I-FPELIM-NEXT: xor a0, a7, a0 ; RV32I-FPELIM-NEXT: xor a1, a2, a1 -; RV32I-FPELIM-NEXT: or a0, a0, a3 ; RV32I-FPELIM-NEXT: or a0, a1, a0 +; RV32I-FPELIM-NEXT: or a0, a0, a3 ; RV32I-FPELIM-NEXT: seqz a0, a0 ; RV32I-FPELIM-NEXT: ret ; @@ -335,8 +335,8 @@ ; RV32I-WITHFP-NEXT: or a3, a3, a4 ; RV32I-WITHFP-NEXT: xor a0, a7, a0 ; RV32I-WITHFP-NEXT: xor a1, a2, a1 -; RV32I-WITHFP-NEXT: or a0, a0, a3 ; RV32I-WITHFP-NEXT: or a0, a1, a0 +; RV32I-WITHFP-NEXT: or a0, a0, a3 ; RV32I-WITHFP-NEXT: seqz a0, a0 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -614,15 +614,15 @@ ; RV32I-FPELIM-LABEL: callee_aligned_stack: ; RV32I-FPELIM: # %bb.0: ; RV32I-FPELIM-NEXT: lw a0, 0(a2) -; RV32I-FPELIM-NEXT: lw a1, 0(sp) -; RV32I-FPELIM-NEXT: lw a2, 8(sp) -; RV32I-FPELIM-NEXT: lw a3, 16(sp) -; RV32I-FPELIM-NEXT: lw a4, 20(sp) -; RV32I-FPELIM-NEXT: add a1, a7, a1 -; RV32I-FPELIM-NEXT: add a1, a1, a2 -; RV32I-FPELIM-NEXT: add a1, a1, a3 -; RV32I-FPELIM-NEXT: add a1, a1, a4 +; RV32I-FPELIM-NEXT: lw a1, 8(sp) +; RV32I-FPELIM-NEXT: lw a2, 0(sp) +; RV32I-FPELIM-NEXT: lw a3, 20(sp) +; RV32I-FPELIM-NEXT: lw a4, 16(sp) +; RV32I-FPELIM-NEXT: add a0, a0, a7 +; RV32I-FPELIM-NEXT: add a1, a2, a1 ; RV32I-FPELIM-NEXT: add a0, a0, a1 +; RV32I-FPELIM-NEXT: add a3, a4, a3 +; RV32I-FPELIM-NEXT: add a0, a0, a3 ; RV32I-FPELIM-NEXT: ret ; ; RV32I-WITHFP-LABEL: callee_aligned_stack: @@ -632,15 +632,15 @@ ; RV32I-WITHFP-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-WITHFP-NEXT: addi s0, sp, 16 ; RV32I-WITHFP-NEXT: lw a0, 0(a2) -; RV32I-WITHFP-NEXT: lw a1, 0(s0) -; RV32I-WITHFP-NEXT: lw a2, 8(s0) -; RV32I-WITHFP-NEXT: lw a3, 16(s0) -; RV32I-WITHFP-NEXT: lw a4, 20(s0) -; RV32I-WITHFP-NEXT: add a1, a7, a1 -; RV32I-WITHFP-NEXT: add a1, a1, a2 -; RV32I-WITHFP-NEXT: add a1, a1, a3 -; RV32I-WITHFP-NEXT: add a1, a1, a4 +; RV32I-WITHFP-NEXT: lw a1, 8(s0) +; RV32I-WITHFP-NEXT: lw a2, 0(s0) +; RV32I-WITHFP-NEXT: lw a3, 
20(s0) +; RV32I-WITHFP-NEXT: lw a4, 16(s0) +; RV32I-WITHFP-NEXT: add a0, a0, a7 +; RV32I-WITHFP-NEXT: add a1, a2, a1 ; RV32I-WITHFP-NEXT: add a0, a0, a1 +; RV32I-WITHFP-NEXT: add a3, a4, a3 +; RV32I-WITHFP-NEXT: add a0, a0, a3 ; RV32I-WITHFP-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-WITHFP-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll --- a/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll +++ b/llvm/test/CodeGen/RISCV/calling-conv-lp64-lp64f-lp64d-common.ll @@ -53,12 +53,12 @@ ; RV64I-NEXT: andi a0, a0, 255 ; RV64I-NEXT: slli a1, a1, 48 ; RV64I-NEXT: srli a1, a1, 48 -; RV64I-NEXT: add a1, a1, a2 -; RV64I-NEXT: xor a2, a4, t1 -; RV64I-NEXT: xor a3, a3, a7 -; RV64I-NEXT: or a2, a3, a2 -; RV64I-NEXT: seqz a2, a2 -; RV64I-NEXT: add a1, a2, a1 +; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: xor a1, a4, t1 +; RV64I-NEXT: xor a2, a3, a7 +; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: seqz a1, a1 ; RV64I-NEXT: add a0, a0, a5 ; RV64I-NEXT: add a0, a0, a6 ; RV64I-NEXT: add a0, a0, t0 @@ -119,8 +119,8 @@ ; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: xor a2, a3, a2 -; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: or a0, a2, a0 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = icmp eq i256 %a, %b @@ -174,8 +174,8 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: xor a0, a7, a0 ; RV64I-NEXT: xor a1, a2, a1 -; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: ret %1 = icmp eq i256 %h, %j @@ -328,15 +328,15 @@ ; should only be 8-byte aligned ; RV64I-LABEL: callee_aligned_stack: ; RV64I: # %bb.0: -; RV64I-NEXT: ld a0, 0(sp) -; RV64I-NEXT: ld a1, 16(sp) -; RV64I-NEXT: ld a2, 32(sp) +; RV64I-NEXT: ld a0, 32(sp) +; RV64I-NEXT: ld a1, 0(sp) +; RV64I-NEXT: ld a2, 16(sp) ; RV64I-NEXT: ld a3, 40(sp) -; RV64I-NEXT: add a0, a7, a0 -; RV64I-NEXT: add a0, a0, a1 -; RV64I-NEXT: add a0, a0, a2 +; RV64I-NEXT: add a5, a5, a7 +; RV64I-NEXT: add a1, a5, a1 +; RV64I-NEXT: add a0, a2, a0 +; RV64I-NEXT: add a0, a1, a0 ; RV64I-NEXT: add a0, a0, a3 -; RV64I-NEXT: add a0, a5, a0 ; RV64I-NEXT: ret %f_trunc = trunc i128 %f to i64 %1 = add i64 %f_trunc, %g diff --git a/llvm/test/CodeGen/RISCV/copysign-casts.ll b/llvm/test/CodeGen/RISCV/copysign-casts.ll --- a/llvm/test/CodeGen/RISCV/copysign-casts.ll +++ b/llvm/test/CodeGen/RISCV/copysign-casts.ll @@ -447,8 +447,8 @@ ; RV32IF-NEXT: srli a1, a1, 16 ; RV32IF-NEXT: slli a0, a0, 17 ; RV32IF-NEXT: srli a0, a0, 17 -; RV32IF-NEXT: lui a2, 1048560 -; RV32IF-NEXT: or a1, a1, a2 +; RV32IF-NEXT: or a0, a0, a1 +; RV32IF-NEXT: lui a1, 1048560 ; RV32IF-NEXT: or a0, a0, a1 ; RV32IF-NEXT: fmv.w.x fa0, a0 ; RV32IF-NEXT: ret @@ -462,8 +462,8 @@ ; RV32IFD-NEXT: srli a1, a1, 16 ; RV32IFD-NEXT: slli a0, a0, 17 ; RV32IFD-NEXT: srli a0, a0, 17 -; RV32IFD-NEXT: lui a2, 1048560 -; RV32IFD-NEXT: or a1, a1, a2 +; RV32IFD-NEXT: or a0, a0, a1 +; RV32IFD-NEXT: lui a1, 1048560 ; RV32IFD-NEXT: or a0, a0, a1 ; RV32IFD-NEXT: fmv.w.x fa0, a0 ; RV32IFD-NEXT: ret @@ -477,8 +477,8 @@ ; RV64IFD-NEXT: srli a1, a1, 16 ; RV64IFD-NEXT: slli a0, a0, 49 ; RV64IFD-NEXT: srli a0, a0, 49 -; RV64IFD-NEXT: lui a2, 1048560 -; RV64IFD-NEXT: or a1, a1, a2 +; RV64IFD-NEXT: or a0, a0, a1 +; RV64IFD-NEXT: lui a1, 1048560 ; RV64IFD-NEXT: or a0, a0, a1 ; RV64IFD-NEXT: fmv.w.x fa0, a0 ; RV64IFD-NEXT: ret @@ 
-581,8 +581,8 @@ ; RV32IF-NEXT: srli a1, a1, 16 ; RV32IF-NEXT: slli a0, a0, 17 ; RV32IF-NEXT: srli a0, a0, 17 -; RV32IF-NEXT: lui a2, 1048560 -; RV32IF-NEXT: or a1, a1, a2 +; RV32IF-NEXT: or a0, a0, a1 +; RV32IF-NEXT: lui a1, 1048560 ; RV32IF-NEXT: or a0, a0, a1 ; RV32IF-NEXT: fmv.w.x fa0, a0 ; RV32IF-NEXT: ret @@ -599,7 +599,7 @@ ; RV32IFD-NEXT: slli a1, a1, 17 ; RV32IFD-NEXT: srli a1, a1, 17 ; RV32IFD-NEXT: lui a2, 1048560 -; RV32IFD-NEXT: or a0, a0, a2 +; RV32IFD-NEXT: or a1, a1, a2 ; RV32IFD-NEXT: or a0, a1, a0 ; RV32IFD-NEXT: fmv.w.x fa0, a0 ; RV32IFD-NEXT: addi sp, sp, 16 @@ -615,7 +615,7 @@ ; RV64IFD-NEXT: slli a0, a0, 63 ; RV64IFD-NEXT: srli a0, a0, 48 ; RV64IFD-NEXT: lui a2, 1048560 -; RV64IFD-NEXT: or a0, a0, a2 +; RV64IFD-NEXT: or a1, a1, a2 ; RV64IFD-NEXT: or a0, a1, a0 ; RV64IFD-NEXT: fmv.w.x fa0, a0 ; RV64IFD-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll --- a/llvm/test/CodeGen/RISCV/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll @@ -80,11 +80,11 @@ ; RV32-NEXT: addi a3, a3, -820 ; RV32-NEXT: mul a3, a5, a3 ; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a0, a1, a4 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a1, a6, a0 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/div-pow2.ll b/llvm/test/CodeGen/RISCV/div-pow2.ll --- a/llvm/test/CodeGen/RISCV/div-pow2.ll +++ b/llvm/test/CodeGen/RISCV/div-pow2.ll @@ -213,8 +213,8 @@ ; RV32I-NEXT: neg a0, a3 ; RV32I-NEXT: snez a2, a3 ; RV32I-NEXT: srai a1, a1, 1 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sdiv64_pow2_negative_2: @@ -269,8 +269,8 @@ ; RV32I-NEXT: neg a0, a3 ; RV32I-NEXT: snez a2, a3 ; RV32I-NEXT: srai a1, a1, 11 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sdiv64_pow2_negative_2048: @@ -326,8 +326,8 @@ ; RV32I-NEXT: neg a0, a3 ; RV32I-NEXT: snez a2, a3 ; RV32I-NEXT: srai a1, a1, 12 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sdiv64_pow2_negative_4096: @@ -383,8 +383,8 @@ ; RV32I-NEXT: neg a0, a3 ; RV32I-NEXT: snez a2, a3 ; RV32I-NEXT: srai a1, a1, 16 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sdiv64_pow2_negative_65536: @@ -404,11 +404,11 @@ ; RV32I-LABEL: sdiv64_pow2_8589934592: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: srli a2, a1, 31 -; RV32I-NEXT: srai a3, a1, 31 -; RV32I-NEXT: add a3, a0, a3 -; RV32I-NEXT: sltu a0, a3, a0 -; RV32I-NEXT: add a0, a2, a0 -; RV32I-NEXT: add a1, a1, a0 +; RV32I-NEXT: add a2, a1, a2 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: sltu a0, a1, a0 +; RV32I-NEXT: add a1, a2, a0 ; RV32I-NEXT: srai a0, a1, 1 ; RV32I-NEXT: srai a1, a1, 31 ; RV32I-NEXT: ret @@ -429,16 +429,16 @@ ; RV32I-LABEL: sdiv64_pow2_negative_8589934592: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: srli a2, a1, 31 -; RV32I-NEXT: srai a3, a1, 31 -; RV32I-NEXT: add a3, a0, a3 -; RV32I-NEXT: sltu a0, a3, a0 +; RV32I-NEXT: add a2, a1, a2 +; RV32I-NEXT: srai a1, a1, 31 +; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: sltu a0, a1, a0 ; RV32I-NEXT: 
add a0, a2, a0 -; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: srai a1, a0, 31 ; RV32I-NEXT: srai a0, a0, 1 ; RV32I-NEXT: snez a2, a0 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/div.ll b/llvm/test/CodeGen/RISCV/div.ll --- a/llvm/test/CodeGen/RISCV/div.ll +++ b/llvm/test/CodeGen/RISCV/div.ll @@ -195,11 +195,11 @@ ; RV32IM-NEXT: addi a3, a3, -820 ; RV32IM-NEXT: mul a3, a5, a3 ; RV32IM-NEXT: mulhu a6, a5, a4 +; RV32IM-NEXT: add a3, a6, a3 ; RV32IM-NEXT: sltu a0, a0, a2 ; RV32IM-NEXT: sub a1, a1, a0 -; RV32IM-NEXT: mul a0, a1, a4 -; RV32IM-NEXT: add a0, a3, a0 -; RV32IM-NEXT: add a1, a6, a0 +; RV32IM-NEXT: mul a1, a1, a4 +; RV32IM-NEXT: add a1, a3, a1 ; RV32IM-NEXT: mul a0, a5, a4 ; RV32IM-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -3227,14 +3227,14 @@ ; RV32IF-NEXT: or a4, a1, a0 ; RV32IF-NEXT: snez a4, a4 ; RV32IF-NEXT: addi a4, a4, -1 +; RV32IF-NEXT: and a3, a4, a3 ; RV32IF-NEXT: xori a0, a0, 1 ; RV32IF-NEXT: or a0, a0, a1 ; RV32IF-NEXT: seqz a0, a0 ; RV32IF-NEXT: addi a1, a0, -1 ; RV32IF-NEXT: and a0, a1, a3 -; RV32IF-NEXT: and a0, a0, a4 +; RV32IF-NEXT: and a2, a4, a2 ; RV32IF-NEXT: and a1, a1, a2 -; RV32IF-NEXT: and a1, a1, a4 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: addi sp, sp, 32 ; RV32IF-NEXT: ret @@ -3248,11 +3248,11 @@ ; RV64-NEXT: call __fixunsdfti@plt ; RV64-NEXT: snez a2, a1 ; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a0, a2, a0 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: seqz a1, a1 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -3272,14 +3272,14 @@ ; RV32IFD-NEXT: or a4, a1, a0 ; RV32IFD-NEXT: snez a4, a4 ; RV32IFD-NEXT: addi a4, a4, -1 +; RV32IFD-NEXT: and a3, a4, a3 ; RV32IFD-NEXT: xori a0, a0, 1 ; RV32IFD-NEXT: or a0, a0, a1 ; RV32IFD-NEXT: seqz a0, a0 ; RV32IFD-NEXT: addi a1, a0, -1 ; RV32IFD-NEXT: and a0, a1, a3 -; RV32IFD-NEXT: and a0, a0, a4 +; RV32IFD-NEXT: and a2, a4, a2 ; RV32IFD-NEXT: and a1, a1, a2 -; RV32IFD-NEXT: and a1, a1, a4 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: addi sp, sp, 32 ; RV32IFD-NEXT: ret @@ -3326,14 +3326,14 @@ ; RV32IF-NEXT: seqz a6, a1 ; RV32IF-NEXT: .LBB47_7: # %entry ; RV32IF-NEXT: neg a6, a6 +; RV32IF-NEXT: and a3, a6, a3 ; RV32IF-NEXT: xori a1, a1, 1 ; RV32IF-NEXT: or a1, a1, a0 ; RV32IF-NEXT: seqz a1, a1 ; RV32IF-NEXT: addi a1, a1, -1 ; RV32IF-NEXT: and a3, a1, a3 -; RV32IF-NEXT: and a3, a3, a6 +; RV32IF-NEXT: and a4, a6, a4 ; RV32IF-NEXT: and a1, a1, a4 -; RV32IF-NEXT: and a1, a1, a6 ; RV32IF-NEXT: neg a4, a5 ; RV32IF-NEXT: and a4, a4, a0 ; RV32IF-NEXT: mv a0, a3 @@ -3376,11 +3376,11 @@ ; RV64-NEXT: .LBB47_2: # %entry ; RV64-NEXT: slti a3, a1, 1 ; RV64-NEXT: neg a3, a3 +; RV64-NEXT: and a0, a3, a0 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: seqz a1, a1 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: and a0, a0, a3 ; RV64-NEXT: beqz a2, .LBB47_4 ; RV64-NEXT: # %bb.3: # %entry ; RV64-NEXT: sgtz a1, a2 @@ -3424,14 +3424,14 @@ ; RV32IFD-NEXT: seqz a6, a1 ; RV32IFD-NEXT: .LBB47_7: # %entry ; RV32IFD-NEXT: neg a6, a6 +; RV32IFD-NEXT: and a3, a6, a3 ; RV32IFD-NEXT: xori a1, a1, 1 ; RV32IFD-NEXT: or a1, a1, a0 ; RV32IFD-NEXT: seqz a1, a1 ; 
RV32IFD-NEXT: addi a1, a1, -1 ; RV32IFD-NEXT: and a3, a1, a3 -; RV32IFD-NEXT: and a3, a3, a6 +; RV32IFD-NEXT: and a4, a6, a4 ; RV32IFD-NEXT: and a1, a1, a4 -; RV32IFD-NEXT: and a1, a1, a6 ; RV32IFD-NEXT: neg a4, a5 ; RV32IFD-NEXT: and a4, a4, a0 ; RV32IFD-NEXT: mv a0, a3 @@ -3590,14 +3590,14 @@ ; RV32-NEXT: or a4, a1, a0 ; RV32-NEXT: snez a4, a4 ; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: and a3, a4, a3 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: addi a1, a0, -1 ; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a2, a4, a2 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: and a1, a1, a4 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -3611,11 +3611,11 @@ ; RV64-NEXT: call __fixunssfti@plt ; RV64-NEXT: snez a2, a1 ; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a0, a2, a0 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: seqz a1, a1 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -3660,14 +3660,14 @@ ; RV32-NEXT: seqz a6, a1 ; RV32-NEXT: .LBB50_7: # %entry ; RV32-NEXT: neg a6, a6 +; RV32-NEXT: and a3, a6, a3 ; RV32-NEXT: xori a1, a1, 1 ; RV32-NEXT: or a1, a1, a0 ; RV32-NEXT: seqz a1, a1 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a3, a1, a3 -; RV32-NEXT: and a3, a3, a6 +; RV32-NEXT: and a4, a6, a4 ; RV32-NEXT: and a1, a1, a4 -; RV32-NEXT: and a1, a1, a6 ; RV32-NEXT: neg a4, a5 ; RV32-NEXT: and a4, a4, a0 ; RV32-NEXT: mv a0, a3 @@ -3710,11 +3710,11 @@ ; RV64-NEXT: .LBB50_2: # %entry ; RV64-NEXT: slti a3, a1, 1 ; RV64-NEXT: neg a3, a3 +; RV64-NEXT: and a0, a3, a0 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: seqz a1, a1 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: and a0, a0, a3 ; RV64-NEXT: beqz a2, .LBB50_4 ; RV64-NEXT: # %bb.3: # %entry ; RV64-NEXT: sgtz a1, a2 @@ -3901,14 +3901,14 @@ ; RV32-NEXT: or a4, a1, a0 ; RV32-NEXT: snez a4, a4 ; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: and a3, a4, a3 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: seqz a0, a0 ; RV32-NEXT: addi a1, a0, -1 ; RV32-NEXT: and a0, a1, a3 -; RV32-NEXT: and a0, a0, a4 +; RV32-NEXT: and a2, a4, a2 ; RV32-NEXT: and a1, a1, a2 -; RV32-NEXT: and a1, a1, a4 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 32 ; RV32-NEXT: ret @@ -3924,11 +3924,11 @@ ; RV64-NEXT: call __fixunssfti@plt ; RV64-NEXT: snez a2, a1 ; RV64-NEXT: addi a2, a2, -1 +; RV64-NEXT: and a0, a2, a0 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: seqz a1, a1 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: and a0, a0, a2 ; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret @@ -3975,14 +3975,14 @@ ; RV32-NEXT: seqz a6, a1 ; RV32-NEXT: .LBB53_7: # %entry ; RV32-NEXT: neg a6, a6 +; RV32-NEXT: and a3, a6, a3 ; RV32-NEXT: xori a1, a1, 1 ; RV32-NEXT: or a1, a1, a0 ; RV32-NEXT: seqz a1, a1 ; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a3, a1, a3 -; RV32-NEXT: and a3, a3, a6 +; RV32-NEXT: and a4, a6, a4 ; RV32-NEXT: and a1, a1, a4 -; RV32-NEXT: and a1, a1, a6 ; RV32-NEXT: neg a4, a5 ; RV32-NEXT: and a4, a4, a0 ; RV32-NEXT: mv a0, a3 @@ -4027,11 +4027,11 @@ ; RV64-NEXT: .LBB53_2: # %entry ; RV64-NEXT: slti a3, a1, 1 ; RV64-NEXT: neg a3, a3 +; RV64-NEXT: and a0, a3, a0 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: seqz a1, a1 ; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a0, a1, a0 -; RV64-NEXT: and a0, a0, a3 ; 
RV64-NEXT: beqz a2, .LBB53_4 ; RV64-NEXT: # %bb.3: # %entry ; RV64-NEXT: sgtz a1, a2 diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll @@ -5574,17 +5574,17 @@ ; CHECK-NOV-NEXT: call __fixunsdfti@plt ; CHECK-NOV-NEXT: snez a2, a1 ; CHECK-NOV-NEXT: addi a2, a2, -1 +; CHECK-NOV-NEXT: and a0, a2, a0 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: seqz a1, a1 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: and a0, a1, a0 -; CHECK-NOV-NEXT: and a0, a0, a2 ; CHECK-NOV-NEXT: snez a1, s1 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: and a1, a1, s0 ; CHECK-NOV-NEXT: addi s1, s1, -1 ; CHECK-NOV-NEXT: seqz a2, s1 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s0 ; CHECK-NOV-NEXT: and a1, a2, a1 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -5622,18 +5622,18 @@ ; CHECK-V-NEXT: call __fixunsdfti@plt ; CHECK-V-NEXT: snez a2, s1 ; CHECK-V-NEXT: addi a2, a2, -1 +; CHECK-V-NEXT: and a2, a2, s0 ; CHECK-V-NEXT: addi s1, s1, -1 ; CHECK-V-NEXT: seqz a3, s1 ; CHECK-V-NEXT: addi a3, a3, -1 -; CHECK-V-NEXT: and a3, a3, s0 ; CHECK-V-NEXT: and a2, a3, a2 ; CHECK-V-NEXT: snez a3, a1 ; CHECK-V-NEXT: addi a3, a3, -1 +; CHECK-V-NEXT: and a0, a3, a0 ; CHECK-V-NEXT: addi a1, a1, -1 ; CHECK-V-NEXT: seqz a1, a1 ; CHECK-V-NEXT: addi a1, a1, -1 ; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: and a0, a0, a3 ; CHECK-V-NEXT: sd a0, 24(sp) ; CHECK-V-NEXT: sd a2, 32(sp) ; CHECK-V-NEXT: addi a0, sp, 24 @@ -5695,10 +5695,10 @@ ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: slti a0, s1, 1 ; CHECK-NOV-NEXT: neg a0, a0 +; CHECK-NOV-NEXT: and a0, a0, s0 ; CHECK-NOV-NEXT: addi s1, s1, -1 ; CHECK-NOV-NEXT: seqz a5, s1 ; CHECK-NOV-NEXT: addi a5, a5, -1 -; CHECK-NOV-NEXT: and a5, a5, s0 ; CHECK-NOV-NEXT: and a0, a5, a0 ; CHECK-NOV-NEXT: beqz a4, .LBB47_6 ; CHECK-NOV-NEXT: # %bb.5: # %entry @@ -6064,17 +6064,17 @@ ; CHECK-NOV-NEXT: call __fixunssfti@plt ; CHECK-NOV-NEXT: snez a2, a1 ; CHECK-NOV-NEXT: addi a2, a2, -1 +; CHECK-NOV-NEXT: and a0, a2, a0 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: seqz a1, a1 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: and a0, a1, a0 -; CHECK-NOV-NEXT: and a0, a0, a2 ; CHECK-NOV-NEXT: snez a1, s1 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: and a1, a1, s0 ; CHECK-NOV-NEXT: addi s1, s1, -1 ; CHECK-NOV-NEXT: seqz a2, s1 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s0 ; CHECK-NOV-NEXT: and a1, a2, a1 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -6112,18 +6112,18 @@ ; CHECK-V-NEXT: call __fixunssfti@plt ; CHECK-V-NEXT: snez a2, s1 ; CHECK-V-NEXT: addi a2, a2, -1 +; CHECK-V-NEXT: and a2, a2, s0 ; CHECK-V-NEXT: addi s1, s1, -1 ; CHECK-V-NEXT: seqz a3, s1 ; CHECK-V-NEXT: addi a3, a3, -1 -; CHECK-V-NEXT: and a3, a3, s0 ; CHECK-V-NEXT: and a2, a3, a2 ; CHECK-V-NEXT: snez a3, a1 ; CHECK-V-NEXT: addi a3, a3, -1 +; CHECK-V-NEXT: and a0, a3, a0 ; CHECK-V-NEXT: addi a1, a1, -1 ; CHECK-V-NEXT: seqz a1, a1 ; CHECK-V-NEXT: addi a1, a1, -1 ; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: and a0, a0, a3 ; CHECK-V-NEXT: sd a0, 24(sp) ; CHECK-V-NEXT: sd a2, 32(sp) ; CHECK-V-NEXT: addi a0, sp, 24 @@ -6185,10 +6185,10 @@ ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: slti a0, s1, 1 ; CHECK-NOV-NEXT: neg a0, a0 +; CHECK-NOV-NEXT: and a0, a0, s0 ; CHECK-NOV-NEXT: addi 
s1, s1, -1 ; CHECK-NOV-NEXT: seqz a5, s1 ; CHECK-NOV-NEXT: addi a5, a5, -1 -; CHECK-NOV-NEXT: and a5, a5, s0 ; CHECK-NOV-NEXT: and a0, a5, a0 ; CHECK-NOV-NEXT: beqz a4, .LBB50_6 ; CHECK-NOV-NEXT: # %bb.5: # %entry @@ -6549,17 +6549,17 @@ ; CHECK-NOV-NEXT: call __fixunssfti@plt ; CHECK-NOV-NEXT: snez a2, a1 ; CHECK-NOV-NEXT: addi a2, a2, -1 +; CHECK-NOV-NEXT: and a0, a2, a0 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: seqz a1, a1 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: and a0, a1, a0 -; CHECK-NOV-NEXT: and a0, a0, a2 ; CHECK-NOV-NEXT: snez a1, s2 ; CHECK-NOV-NEXT: addi a1, a1, -1 +; CHECK-NOV-NEXT: and a1, a1, s1 ; CHECK-NOV-NEXT: addi s2, s2, -1 ; CHECK-NOV-NEXT: seqz a2, s2 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s1 ; CHECK-NOV-NEXT: and a1, a2, a1 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -6591,17 +6591,17 @@ ; CHECK-V-NEXT: call __fixunssfti@plt ; CHECK-V-NEXT: snez a2, a1 ; CHECK-V-NEXT: addi a2, a2, -1 +; CHECK-V-NEXT: and a0, a2, a0 ; CHECK-V-NEXT: addi a1, a1, -1 ; CHECK-V-NEXT: seqz a1, a1 ; CHECK-V-NEXT: addi a1, a1, -1 ; CHECK-V-NEXT: and a0, a1, a0 -; CHECK-V-NEXT: and a0, a0, a2 ; CHECK-V-NEXT: snez a1, s2 ; CHECK-V-NEXT: addi a1, a1, -1 +; CHECK-V-NEXT: and a1, a1, s1 ; CHECK-V-NEXT: addi s2, s2, -1 ; CHECK-V-NEXT: seqz a2, s2 ; CHECK-V-NEXT: addi a2, a2, -1 -; CHECK-V-NEXT: and a2, a2, s1 ; CHECK-V-NEXT: and a1, a2, a1 ; CHECK-V-NEXT: sd a1, 8(sp) ; CHECK-V-NEXT: sd a0, 0(sp) @@ -6664,10 +6664,10 @@ ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: slti a0, s1, 1 ; CHECK-NOV-NEXT: neg a0, a0 +; CHECK-NOV-NEXT: and a0, a0, s0 ; CHECK-NOV-NEXT: addi s1, s1, -1 ; CHECK-NOV-NEXT: seqz a5, s1 ; CHECK-NOV-NEXT: addi a5, a5, -1 -; CHECK-NOV-NEXT: and a5, a5, s0 ; CHECK-NOV-NEXT: and a0, a5, a0 ; CHECK-NOV-NEXT: beqz a4, .LBB53_6 ; CHECK-NOV-NEXT: # %bb.5: # %entry @@ -6727,10 +6727,10 @@ ; CHECK-V-NEXT: addi a1, a1, -1 ; CHECK-V-NEXT: slti a0, s1, 1 ; CHECK-V-NEXT: neg a0, a0 +; CHECK-V-NEXT: and a0, a0, s0 ; CHECK-V-NEXT: addi s1, s1, -1 ; CHECK-V-NEXT: seqz a5, s1 ; CHECK-V-NEXT: addi a5, a5, -1 -; CHECK-V-NEXT: and a5, a5, s0 ; CHECK-V-NEXT: and a0, a5, a0 ; CHECK-V-NEXT: beqz a4, .LBB53_6 ; CHECK-V-NEXT: # %bb.5: # %entry diff --git a/llvm/test/CodeGen/RISCV/iabs.ll b/llvm/test/CodeGen/RISCV/iabs.ll --- a/llvm/test/CodeGen/RISCV/iabs.ll +++ b/llvm/test/CodeGen/RISCV/iabs.ll @@ -225,8 +225,8 @@ ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: snez a2, a0 ; RV32I-NEXT: neg a0, a0 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: .LBB6_2: ; RV32I-NEXT: ret ; @@ -236,8 +236,8 @@ ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: snez a2, a0 ; RV32ZBB-NEXT: neg a0, a0 -; RV32ZBB-NEXT: neg a2, a2 -; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a1, a1, a2 ; RV32ZBB-NEXT: .LBB6_2: ; RV32ZBB-NEXT: ret ; @@ -264,8 +264,8 @@ ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: snez a2, a0 ; RV32I-NEXT: neg a0, a0 -; RV32I-NEXT: neg a2, a2 -; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a2 ; RV32I-NEXT: .LBB7_2: ; RV32I-NEXT: ret ; @@ -275,8 +275,8 @@ ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: snez a2, a0 ; RV32ZBB-NEXT: neg a0, a0 -; RV32ZBB-NEXT: neg a2, a2 -; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a1, a1, a2 ; RV32ZBB-NEXT: .LBB7_2: ; RV32ZBB-NEXT: ret ; @@ -301,64 +301,64 @@ define i128 @abs128(i128 %x) { ; RV32I-LABEL: abs128: ; 
RV32I: # %bb.0: -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a2, 12(a1) -; RV32I-NEXT: snez a5, a4 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a2, 4(a1) +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: snez a5, a3 ; RV32I-NEXT: mv a6, a5 -; RV32I-NEXT: beqz a3, .LBB8_2 +; RV32I-NEXT: beqz a2, .LBB8_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: snez a6, a3 +; RV32I-NEXT: snez a6, a2 ; RV32I-NEXT: .LBB8_2: ; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a2, .LBB8_4 +; RV32I-NEXT: bgez a4, .LBB8_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: neg a7, a1 ; RV32I-NEXT: sltu t0, a7, a6 ; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: add a1, a4, a1 ; RV32I-NEXT: add a1, a1, t0 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a2, a1, a2 +; RV32I-NEXT: neg a4, a1 ; RV32I-NEXT: sub a1, a7, a6 -; RV32I-NEXT: add a3, a3, a5 +; RV32I-NEXT: add a2, a2, a5 +; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: neg a4, a4 ; RV32I-NEXT: .LBB8_4: -; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a4, 0(a1) -; RV32ZBB-NEXT: lw a3, 4(a1) -; RV32ZBB-NEXT: lw a2, 12(a1) -; RV32ZBB-NEXT: snez a5, a4 +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a2, 4(a1) +; RV32ZBB-NEXT: lw a4, 12(a1) +; RV32ZBB-NEXT: snez a5, a3 ; RV32ZBB-NEXT: mv a6, a5 -; RV32ZBB-NEXT: beqz a3, .LBB8_2 +; RV32ZBB-NEXT: beqz a2, .LBB8_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: snez a6, a3 +; RV32ZBB-NEXT: snez a6, a2 ; RV32ZBB-NEXT: .LBB8_2: ; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a2, .LBB8_4 +; RV32ZBB-NEXT: bgez a4, .LBB8_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: neg a7, a1 ; RV32ZBB-NEXT: sltu t0, a7, a6 ; RV32ZBB-NEXT: snez a1, a1 +; RV32ZBB-NEXT: add a1, a4, a1 ; RV32ZBB-NEXT: add a1, a1, t0 -; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a2, a1, a2 +; RV32ZBB-NEXT: neg a4, a1 ; RV32ZBB-NEXT: sub a1, a7, a6 -; RV32ZBB-NEXT: add a3, a3, a5 +; RV32ZBB-NEXT: add a2, a2, a5 +; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: neg a3, a3 -; RV32ZBB-NEXT: neg a4, a4 ; RV32ZBB-NEXT: .LBB8_4: -; RV32ZBB-NEXT: sw a4, 0(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a3, 4(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: abs128: @@ -367,8 +367,8 @@ ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: snez a2, a0 ; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: neg a2, a2 -; RV64I-NEXT: sub a1, a2, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: sub a1, a1, a2 ; RV64I-NEXT: .LBB8_2: ; RV64I-NEXT: ret ; @@ -378,8 +378,8 @@ ; RV64ZBB-NEXT: # %bb.1: ; RV64ZBB-NEXT: snez a2, a0 ; RV64ZBB-NEXT: neg a0, a0 -; RV64ZBB-NEXT: neg a2, a2 -; RV64ZBB-NEXT: sub a1, a2, a1 +; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: sub a1, a1, a2 ; RV64ZBB-NEXT: .LBB8_2: ; RV64ZBB-NEXT: ret %abs = tail call i128 @llvm.abs.i128(i128 %x, i1 true) @@ -389,64 +389,64 @@ define i128 @select_abs128(i128 %x) { ; RV32I-LABEL: select_abs128: ; RV32I: # %bb.0: -; RV32I-NEXT: lw a4, 0(a1) -; RV32I-NEXT: lw a3, 4(a1) -; RV32I-NEXT: lw a2, 12(a1) -; RV32I-NEXT: snez a5, a4 +; RV32I-NEXT: lw a3, 0(a1) +; RV32I-NEXT: lw a2, 4(a1) +; RV32I-NEXT: lw a4, 12(a1) +; RV32I-NEXT: snez a5, a3 ; RV32I-NEXT: mv a6, a5 -; RV32I-NEXT: beqz a3, .LBB9_2 +; RV32I-NEXT: beqz a2, .LBB9_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: snez a6, a3 +; RV32I-NEXT: snez a6, a2 ; 
RV32I-NEXT: .LBB9_2: ; RV32I-NEXT: lw a1, 8(a1) -; RV32I-NEXT: bgez a2, .LBB9_4 +; RV32I-NEXT: bgez a4, .LBB9_4 ; RV32I-NEXT: # %bb.3: ; RV32I-NEXT: neg a7, a1 ; RV32I-NEXT: sltu t0, a7, a6 ; RV32I-NEXT: snez a1, a1 +; RV32I-NEXT: add a1, a4, a1 ; RV32I-NEXT: add a1, a1, t0 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a2, a1, a2 +; RV32I-NEXT: neg a4, a1 ; RV32I-NEXT: sub a1, a7, a6 -; RV32I-NEXT: add a3, a3, a5 +; RV32I-NEXT: add a2, a2, a5 +; RV32I-NEXT: neg a2, a2 ; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: neg a4, a4 ; RV32I-NEXT: .LBB9_4: -; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a3, 0(a0) ; RV32I-NEXT: sw a1, 8(a0) -; RV32I-NEXT: sw a3, 4(a0) -; RV32I-NEXT: sw a2, 12(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: sw a4, 12(a0) ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: select_abs128: ; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: lw a4, 0(a1) -; RV32ZBB-NEXT: lw a3, 4(a1) -; RV32ZBB-NEXT: lw a2, 12(a1) -; RV32ZBB-NEXT: snez a5, a4 +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a2, 4(a1) +; RV32ZBB-NEXT: lw a4, 12(a1) +; RV32ZBB-NEXT: snez a5, a3 ; RV32ZBB-NEXT: mv a6, a5 -; RV32ZBB-NEXT: beqz a3, .LBB9_2 +; RV32ZBB-NEXT: beqz a2, .LBB9_2 ; RV32ZBB-NEXT: # %bb.1: -; RV32ZBB-NEXT: snez a6, a3 +; RV32ZBB-NEXT: snez a6, a2 ; RV32ZBB-NEXT: .LBB9_2: ; RV32ZBB-NEXT: lw a1, 8(a1) -; RV32ZBB-NEXT: bgez a2, .LBB9_4 +; RV32ZBB-NEXT: bgez a4, .LBB9_4 ; RV32ZBB-NEXT: # %bb.3: ; RV32ZBB-NEXT: neg a7, a1 ; RV32ZBB-NEXT: sltu t0, a7, a6 ; RV32ZBB-NEXT: snez a1, a1 +; RV32ZBB-NEXT: add a1, a4, a1 ; RV32ZBB-NEXT: add a1, a1, t0 -; RV32ZBB-NEXT: neg a1, a1 -; RV32ZBB-NEXT: sub a2, a1, a2 +; RV32ZBB-NEXT: neg a4, a1 ; RV32ZBB-NEXT: sub a1, a7, a6 -; RV32ZBB-NEXT: add a3, a3, a5 +; RV32ZBB-NEXT: add a2, a2, a5 +; RV32ZBB-NEXT: neg a2, a2 ; RV32ZBB-NEXT: neg a3, a3 -; RV32ZBB-NEXT: neg a4, a4 ; RV32ZBB-NEXT: .LBB9_4: -; RV32ZBB-NEXT: sw a4, 0(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) ; RV32ZBB-NEXT: sw a1, 8(a0) -; RV32ZBB-NEXT: sw a3, 4(a0) -; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: sw a2, 4(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) ; RV32ZBB-NEXT: ret ; ; RV64I-LABEL: select_abs128: @@ -455,8 +455,8 @@ ; RV64I-NEXT: # %bb.1: ; RV64I-NEXT: snez a2, a0 ; RV64I-NEXT: neg a0, a0 -; RV64I-NEXT: neg a2, a2 -; RV64I-NEXT: sub a1, a2, a1 +; RV64I-NEXT: neg a1, a1 +; RV64I-NEXT: sub a1, a1, a2 ; RV64I-NEXT: .LBB9_2: ; RV64I-NEXT: ret ; @@ -466,8 +466,8 @@ ; RV64ZBB-NEXT: # %bb.1: ; RV64ZBB-NEXT: snez a2, a0 ; RV64ZBB-NEXT: neg a0, a0 -; RV64ZBB-NEXT: neg a2, a2 -; RV64ZBB-NEXT: sub a1, a2, a1 +; RV64ZBB-NEXT: neg a1, a1 +; RV64ZBB-NEXT: sub a1, a1, a2 ; RV64ZBB-NEXT: .LBB9_2: ; RV64ZBB-NEXT: ret %1 = icmp slt i128 %x, 0 diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -142,9 +142,9 @@ ; RV32IM: # %bb.0: ; RV32IM-NEXT: mul a3, a0, a3 ; RV32IM-NEXT: mulhu a4, a0, a2 +; RV32IM-NEXT: add a3, a4, a3 ; RV32IM-NEXT: mul a1, a1, a2 ; RV32IM-NEXT: add a1, a3, a1 -; RV32IM-NEXT: add a1, a4, a1 ; RV32IM-NEXT: mul a0, a0, a2 ; RV32IM-NEXT: ret ; @@ -169,8 +169,8 @@ ; RV32I-NEXT: srli a0, a0, 30 ; RV32I-NEXT: slli a4, a1, 2 ; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: add a1, a0, a3 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; @@ -179,8 +179,8 @@ ; RV32IM-NEXT: li a2, 5 ; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: slli a3, a1, 2 +; RV32IM-NEXT: add a1, a3, a1 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: add a1, a1, a3 ; RV32IM-NEXT: slli a2, a0, 2 ; 
RV32IM-NEXT: add a0, a2, a0 ; RV32IM-NEXT: ret @@ -256,8 +256,8 @@ ; RV32I-NEXT: srli a0, a0, 30 ; RV32I-NEXT: slli a3, a1, 2 ; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: mulhs_positive_constant: @@ -298,11 +298,11 @@ ; RV32I-NEXT: srli a0, a0, 30 ; RV32I-NEXT: slli a4, a1, 2 ; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: snez a2, a3 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: neg a1, a1 -; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: snez a1, a3 +; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: mulhs_negative_constant: @@ -530,8 +530,8 @@ ; RV32I-NEXT: srli a0, a0, 26 ; RV32I-NEXT: slli a4, a1, 6 ; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: add a1, a1, a3 -; RV32I-NEXT: add a1, a0, a1 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: add a1, a0, a3 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret ; @@ -540,8 +540,8 @@ ; RV32IM-NEXT: li a2, 65 ; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: slli a3, a1, 6 +; RV32IM-NEXT: add a1, a3, a1 ; RV32IM-NEXT: add a1, a2, a1 -; RV32IM-NEXT: add a1, a1, a3 ; RV32IM-NEXT: slli a2, a0, 6 ; RV32IM-NEXT: add a0, a2, a0 ; RV32IM-NEXT: ret @@ -569,8 +569,8 @@ ; RV32I-NEXT: srli a4, a0, 26 ; RV32I-NEXT: slli a5, a1, 6 ; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: add a1, a1, a3 ; RV32I-NEXT: sub a1, a4, a1 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; @@ -579,8 +579,8 @@ ; RV32IM-NEXT: li a2, 63 ; RV32IM-NEXT: mulhu a2, a0, a2 ; RV32IM-NEXT: slli a3, a1, 6 -; RV32IM-NEXT: sub a1, a2, a1 -; RV32IM-NEXT: add a1, a1, a3 +; RV32IM-NEXT: sub a1, a3, a1 +; RV32IM-NEXT: add a1, a2, a1 ; RV32IM-NEXT: slli a2, a0, 6 ; RV32IM-NEXT: sub a0, a2, a0 ; RV32IM-NEXT: ret @@ -668,7 +668,7 @@ ; RV32I-NEXT: srli a4, a0, 26 ; RV32I-NEXT: slli a5, a1, 6 ; RV32I-NEXT: or a4, a5, a4 -; RV32I-NEXT: add a3, a4, a3 +; RV32I-NEXT: sub a1, a1, a4 ; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret @@ -679,8 +679,8 @@ ; RV32IM-NEXT: sub a1, a1, a2 ; RV32IM-NEXT: li a2, -63 ; RV32IM-NEXT: mulhu a2, a0, a2 -; RV32IM-NEXT: sub a1, a0, a1 -; RV32IM-NEXT: sub a1, a2, a1 +; RV32IM-NEXT: sub a2, a2, a0 +; RV32IM-NEXT: add a1, a2, a1 ; RV32IM-NEXT: slli a2, a0, 6 ; RV32IM-NEXT: sub a0, a0, a2 ; RV32IM-NEXT: ret @@ -709,9 +709,9 @@ ; RV32I-NEXT: srli a0, a0, 26 ; RV32I-NEXT: slli a4, a1, 6 ; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: add a1, a1, a2 -; RV32I-NEXT: snez a2, a3 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: add a0, a0, a2 +; RV32I-NEXT: snez a1, a3 ; RV32I-NEXT: neg a1, a1 ; RV32I-NEXT: sub a1, a1, a0 ; RV32I-NEXT: neg a0, a3 @@ -723,7 +723,7 @@ ; RV32IM-NEXT: add a1, a2, a1 ; RV32IM-NEXT: li a2, -65 ; RV32IM-NEXT: mulhu a2, a0, a2 -; RV32IM-NEXT: add a1, a0, a1 +; RV32IM-NEXT: sub a2, a2, a0 ; RV32IM-NEXT: sub a1, a2, a1 ; RV32IM-NEXT: slli a2, a0, 6 ; RV32IM-NEXT: neg a0, a0 @@ -949,11 +949,11 @@ ; RV32I-NEXT: srli a3, a0, 20 ; RV32I-NEXT: slli a1, a1, 12 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: slli a3, a0, 8 -; RV32I-NEXT: slli a4, a0, 12 -; RV32I-NEXT: add a0, a4, a3 -; RV32I-NEXT: sltu a3, a0, a4 -; RV32I-NEXT: add a2, a2, a3 +; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a3, a0, 12 +; RV32I-NEXT: add a0, a3, a2 +; RV32I-NEXT: sltu a2, a0, a3 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; @@ -993,12 +993,12 @@ ; RV32I-NEXT: srli a3, a0, 20 ; 
RV32I-NEXT: slli a1, a1, 12 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: slli a3, a0, 8 -; RV32I-NEXT: slli a0, a0, 12 -; RV32I-NEXT: sltu a4, a0, a3 -; RV32I-NEXT: add a2, a2, a4 ; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: sub a0, a0, a3 +; RV32I-NEXT: slli a2, a0, 8 +; RV32I-NEXT: slli a0, a0, 12 +; RV32I-NEXT: sltu a3, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli64_p3840: @@ -1047,8 +1047,8 @@ ; RV32IM-NEXT: slli a2, a2, 8 ; RV32IM-NEXT: mul a1, a1, a2 ; RV32IM-NEXT: mulhu a3, a0, a2 -; RV32IM-NEXT: sub a1, a0, a1 -; RV32IM-NEXT: sub a1, a3, a1 +; RV32IM-NEXT: sub a3, a3, a0 +; RV32IM-NEXT: add a1, a3, a1 ; RV32IM-NEXT: mul a0, a0, a2 ; RV32IM-NEXT: ret ; @@ -1077,12 +1077,12 @@ ; RV32I-NEXT: srli a3, a0, 24 ; RV32I-NEXT: slli a1, a1, 8 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: slli a3, a0, 12 -; RV32I-NEXT: slli a0, a0, 8 -; RV32I-NEXT: sltu a4, a0, a3 -; RV32I-NEXT: add a2, a2, a4 ; RV32I-NEXT: sub a1, a1, a2 -; RV32I-NEXT: sub a0, a0, a3 +; RV32I-NEXT: slli a2, a0, 12 +; RV32I-NEXT: slli a0, a0, 8 +; RV32I-NEXT: sltu a3, a0, a2 +; RV32I-NEXT: sub a1, a1, a3 +; RV32I-NEXT: sub a0, a0, a2 ; RV32I-NEXT: ret ; ; RV32IM-LABEL: muli64_m3840: @@ -1091,8 +1091,8 @@ ; RV32IM-NEXT: slli a2, a2, 8 ; RV32IM-NEXT: mul a1, a1, a2 ; RV32IM-NEXT: mulhu a3, a0, a2 -; RV32IM-NEXT: sub a1, a0, a1 -; RV32IM-NEXT: sub a1, a3, a1 +; RV32IM-NEXT: sub a3, a3, a0 +; RV32IM-NEXT: add a1, a3, a1 ; RV32IM-NEXT: mul a0, a0, a2 ; RV32IM-NEXT: ret ; @@ -1126,14 +1126,14 @@ ; RV32I-NEXT: srli a2, a4, 24 ; RV32I-NEXT: slli a7, a3, 8 ; RV32I-NEXT: or a2, a7, a2 -; RV32I-NEXT: sltu a7, a2, a1 -; RV32I-NEXT: srli t0, a3, 20 +; RV32I-NEXT: sltu t0, a2, a1 +; RV32I-NEXT: srli a7, a3, 20 ; RV32I-NEXT: slli t1, a5, 12 -; RV32I-NEXT: or t0, t1, t0 +; RV32I-NEXT: or a7, t1, a7 ; RV32I-NEXT: srli a3, a3, 24 ; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or t1, a5, a3 -; RV32I-NEXT: add t0, t0, a7 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: sub t1, a3, a7 ; RV32I-NEXT: srli a3, a6, 20 ; RV32I-NEXT: slli a5, a4, 12 ; RV32I-NEXT: or a3, a5, a3 @@ -1195,19 +1195,19 @@ ; RV32IM-NEXT: sltu t4, t4, s1 ; RV32IM-NEXT: sltu a7, t1, a7 ; RV32IM-NEXT: mulhu t1, a1, t2 +; RV32IM-NEXT: add a7, t1, a7 ; RV32IM-NEXT: add a7, a7, t4 ; RV32IM-NEXT: sltu t0, t5, t0 ; RV32IM-NEXT: mul a2, a2, a5 -; RV32IM-NEXT: mulhu t2, a3, a5 -; RV32IM-NEXT: sub a3, a3, a2 -; RV32IM-NEXT: sub a2, t2, a3 +; RV32IM-NEXT: mulhu t1, a3, a5 +; RV32IM-NEXT: sub a3, t1, a3 +; RV32IM-NEXT: add a2, a3, a2 ; RV32IM-NEXT: add a1, a4, a1 -; RV32IM-NEXT: sub a1, a1, a2 -; RV32IM-NEXT: sub a1, a1, t0 ; RV32IM-NEXT: sub a1, t3, a1 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: add a1, a1, t0 ; RV32IM-NEXT: add a1, a7, a1 ; RV32IM-NEXT: add a1, a1, s0 -; RV32IM-NEXT: add a1, t1, a1 ; RV32IM-NEXT: mul a2, a4, a5 ; RV32IM-NEXT: sw a2, 0(a0) ; RV32IM-NEXT: sw a6, 4(a0) @@ -1226,12 +1226,12 @@ ; RV64I-NEXT: srli a3, a0, 56 ; RV64I-NEXT: slli a1, a1, 8 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: slli a3, a0, 12 -; RV64I-NEXT: slli a0, a0, 8 -; RV64I-NEXT: sltu a4, a0, a3 -; RV64I-NEXT: add a2, a2, a4 ; RV64I-NEXT: sub a1, a1, a2 -; RV64I-NEXT: sub a0, a0, a3 +; RV64I-NEXT: slli a2, a0, 12 +; RV64I-NEXT: slli a0, a0, 8 +; RV64I-NEXT: sltu a3, a0, a2 +; RV64I-NEXT: sub a1, a1, a3 +; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret ; ; RV64IM-LABEL: muli128_m3840: @@ -1240,8 +1240,8 @@ ; RV64IM-NEXT: slli a2, a2, 8 ; RV64IM-NEXT: mul a1, a1, a2 ; RV64IM-NEXT: mulhu a3, a0, a2 -; RV64IM-NEXT: sub a1, a0, a1 -; RV64IM-NEXT: sub a1, 
a3, a1 +; RV64IM-NEXT: sub a3, a3, a0 +; RV64IM-NEXT: add a1, a3, a1 ; RV64IM-NEXT: mul a0, a0, a2 ; RV64IM-NEXT: ret %1 = mul i128 %a, -3840 @@ -1274,9 +1274,9 @@ ; RV32I-NEXT: srli a7, a7, 26 ; RV32I-NEXT: slli t4, a5, 6 ; RV32I-NEXT: or a7, t4, a7 -; RV32I-NEXT: add a7, a7, t1 -; RV32I-NEXT: add a7, a7, t3 ; RV32I-NEXT: sub a5, a5, a7 +; RV32I-NEXT: sub a5, a5, t1 +; RV32I-NEXT: sub a5, a5, t3 ; RV32I-NEXT: sub a7, t2, t0 ; RV32I-NEXT: sub a3, a3, a6 ; RV32I-NEXT: sub a3, a3, a4 @@ -1322,21 +1322,21 @@ ; RV32IM-NEXT: sltu t4, t4, s1 ; RV32IM-NEXT: sltu a7, t1, a7 ; RV32IM-NEXT: mulhu t1, a4, t2 +; RV32IM-NEXT: add a7, t1, a7 ; RV32IM-NEXT: add a7, a7, t4 -; RV32IM-NEXT: slli t2, a2, 6 -; RV32IM-NEXT: sub a2, a2, t2 +; RV32IM-NEXT: slli t1, a2, 6 +; RV32IM-NEXT: sub a2, a2, t1 ; RV32IM-NEXT: mulhu a5, a1, a5 -; RV32IM-NEXT: sub a1, a1, a2 ; RV32IM-NEXT: sub a5, a5, a1 +; RV32IM-NEXT: add a2, a5, a2 ; RV32IM-NEXT: add a4, a3, a4 -; RV32IM-NEXT: sub a4, a4, a5 -; RV32IM-NEXT: neg a1, t5 -; RV32IM-NEXT: sltu a1, a1, t0 -; RV32IM-NEXT: sub a4, a4, a1 ; RV32IM-NEXT: sub a1, t3, a4 +; RV32IM-NEXT: add a1, a1, a2 +; RV32IM-NEXT: neg a2, t5 +; RV32IM-NEXT: sltu a2, a2, t0 +; RV32IM-NEXT: add a1, a1, a2 ; RV32IM-NEXT: add a1, a7, a1 ; RV32IM-NEXT: add a1, a1, s0 -; RV32IM-NEXT: add a1, t1, a1 ; RV32IM-NEXT: slli a2, a3, 6 ; RV32IM-NEXT: sub a3, a3, a2 ; RV32IM-NEXT: sw a3, 0(a0) @@ -1355,7 +1355,7 @@ ; RV64I-NEXT: srli a4, a0, 58 ; RV64I-NEXT: slli a5, a1, 6 ; RV64I-NEXT: or a4, a5, a4 -; RV64I-NEXT: add a3, a4, a3 +; RV64I-NEXT: sub a1, a1, a4 ; RV64I-NEXT: sub a1, a1, a3 ; RV64I-NEXT: sub a0, a0, a2 ; RV64I-NEXT: ret @@ -1366,8 +1366,8 @@ ; RV64IM-NEXT: sub a1, a1, a2 ; RV64IM-NEXT: li a2, -63 ; RV64IM-NEXT: mulhu a2, a0, a2 -; RV64IM-NEXT: sub a1, a0, a1 -; RV64IM-NEXT: sub a1, a2, a1 +; RV64IM-NEXT: sub a2, a2, a0 +; RV64IM-NEXT: add a1, a2, a1 ; RV64IM-NEXT: slli a2, a0, 6 ; RV64IM-NEXT: sub a0, a0, a2 ; RV64IM-NEXT: ret @@ -1441,13 +1441,13 @@ ; RV32I-NEXT: sltu a3, a2, s9 ; RV32I-NEXT: sltu a4, s9, s5 ; RV32I-NEXT: sltu a5, s8, s7 +; RV32I-NEXT: add a5, s6, a5 ; RV32I-NEXT: add a4, a5, a4 +; RV32I-NEXT: add a1, a1, s3 ; RV32I-NEXT: sltu a0, s2, a0 -; RV32I-NEXT: add a0, s3, a0 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: add a0, a4, a0 -; RV32I-NEXT: add a0, a0, a3 -; RV32I-NEXT: add a1, s6, a0 +; RV32I-NEXT: add a1, a0, a3 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 40(sp) # 4-byte Folded Reload @@ -1486,14 +1486,14 @@ ; RV32IM-NEXT: sltu a7, t0, a7 ; RV32IM-NEXT: sltu a5, a5, a6 ; RV32IM-NEXT: mulhu a3, a1, a3 -; RV32IM-NEXT: add a5, a5, a7 +; RV32IM-NEXT: add a3, a3, a5 +; RV32IM-NEXT: add a3, a3, a7 ; RV32IM-NEXT: mul a1, a4, a1 ; RV32IM-NEXT: mulhu a0, a4, a0 -; RV32IM-NEXT: add a1, a1, t1 ; RV32IM-NEXT: add a0, a0, a1 -; RV32IM-NEXT: add a0, a5, a0 -; RV32IM-NEXT: add a0, a0, t2 -; RV32IM-NEXT: add a1, a3, a0 +; RV32IM-NEXT: add a0, a0, t1 +; RV32IM-NEXT: add a0, a3, a0 +; RV32IM-NEXT: add a1, a0, t2 ; RV32IM-NEXT: mv a0, a2 ; RV32IM-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll --- a/llvm/test/CodeGen/RISCV/neg-abs.ll +++ b/llvm/test/CodeGen/RISCV/neg-abs.ll @@ -83,8 +83,8 @@ ; RV32I-NEXT: xor a0, a0, a2 ; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: xor a1, a1, a2 -; RV32I-NEXT: add a1, a1, a3 ; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; @@ -94,8 +94,8 @@ ; RV32ZBB-NEXT: xor a0, a0, a2 ; RV32ZBB-NEXT: sltu a3, a2, a0 ; 
RV32ZBB-NEXT: xor a1, a1, a2 -; RV32ZBB-NEXT: add a1, a1, a3 ; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a0, a2, a0 ; RV32ZBB-NEXT: ret ; @@ -123,8 +123,8 @@ ; RV32I-NEXT: xor a0, a0, a2 ; RV32I-NEXT: sltu a3, a2, a0 ; RV32I-NEXT: xor a1, a1, a2 -; RV32I-NEXT: add a1, a1, a3 ; RV32I-NEXT: sub a1, a2, a1 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: sub a0, a2, a0 ; RV32I-NEXT: ret ; @@ -134,8 +134,8 @@ ; RV32ZBB-NEXT: xor a0, a0, a2 ; RV32ZBB-NEXT: sltu a3, a2, a0 ; RV32ZBB-NEXT: xor a1, a1, a2 -; RV32ZBB-NEXT: add a1, a1, a3 ; RV32ZBB-NEXT: sub a1, a2, a1 +; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: sub a0, a2, a0 ; RV32ZBB-NEXT: ret ; @@ -204,14 +204,14 @@ ; RV32I-NEXT: bgez a1, .LBB5_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: snez a3, a0 -; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: sub a1, a3, a1 +; RV32I-NEXT: neg a1, a1 +; RV32I-NEXT: sub a1, a1, a3 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: .LBB5_2: ; RV32I-NEXT: sw a0, 0(a2) ; RV32I-NEXT: snez a3, a0 -; RV32I-NEXT: neg a3, a3 -; RV32I-NEXT: sub a3, a3, a1 +; RV32I-NEXT: neg a4, a1 +; RV32I-NEXT: sub a3, a4, a3 ; RV32I-NEXT: neg a0, a0 ; RV32I-NEXT: sw a1, 4(a2) ; RV32I-NEXT: mv a1, a3 @@ -222,14 +222,14 @@ ; RV32ZBB-NEXT: bgez a1, .LBB5_2 ; RV32ZBB-NEXT: # %bb.1: ; RV32ZBB-NEXT: snez a3, a0 -; RV32ZBB-NEXT: neg a3, a3 -; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: neg a1, a1 +; RV32ZBB-NEXT: sub a1, a1, a3 ; RV32ZBB-NEXT: neg a0, a0 ; RV32ZBB-NEXT: .LBB5_2: ; RV32ZBB-NEXT: sw a0, 0(a2) ; RV32ZBB-NEXT: snez a3, a0 -; RV32ZBB-NEXT: neg a3, a3 -; RV32ZBB-NEXT: sub a3, a3, a1 +; RV32ZBB-NEXT: neg a4, a1 +; RV32ZBB-NEXT: sub a3, a4, a3 ; RV32ZBB-NEXT: neg a0, a0 ; RV32ZBB-NEXT: sw a1, 4(a2) ; RV32ZBB-NEXT: mv a1, a3 diff --git a/llvm/test/CodeGen/RISCV/rotl-rotr.ll b/llvm/test/CodeGen/RISCV/rotl-rotr.ll --- a/llvm/test/CodeGen/RISCV/rotl-rotr.ll +++ b/llvm/test/CodeGen/RISCV/rotl-rotr.ll @@ -1076,8 +1076,8 @@ ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: add a0, a1, a0 ; RV32I-NEXT: sltu a1, a0, a1 +; RV32I-NEXT: add a3, a5, a3 ; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: add a1, a5, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: rotl_64_mask_shared: @@ -1131,8 +1131,8 @@ ; RV32ZBB-NEXT: and a0, a0, a2 ; RV32ZBB-NEXT: add a0, a1, a0 ; RV32ZBB-NEXT: sltu a1, a0, a1 +; RV32ZBB-NEXT: add a3, a5, a3 ; RV32ZBB-NEXT: add a1, a3, a1 -; RV32ZBB-NEXT: add a1, a5, a1 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: rotl_64_mask_shared: @@ -1232,7 +1232,7 @@ ; RV32I-NEXT: and a0, a0, a2 ; RV32I-NEXT: add a0, a6, a0 ; RV32I-NEXT: sltu a2, a0, a6 -; RV32I-NEXT: add a2, a3, a2 +; RV32I-NEXT: add a1, a1, a3 ; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; @@ -1286,7 +1286,7 @@ ; RV32ZBB-NEXT: and a0, a0, a2 ; RV32ZBB-NEXT: add a0, a6, a0 ; RV32ZBB-NEXT: sltu a2, a0, a6 -; RV32ZBB-NEXT: add a2, a3, a2 +; RV32ZBB-NEXT: add a1, a1, a3 ; RV32ZBB-NEXT: add a1, a1, a2 ; RV32ZBB-NEXT: ret ; @@ -1369,14 +1369,14 @@ ; RV32I-NEXT: srl t0, t0, a1 ; RV32I-NEXT: sll t1, a0, a4 ; RV32I-NEXT: srli a0, a6, 1 -; RV32I-NEXT: srl a6, a0, a1 +; RV32I-NEXT: srl t2, a0, a1 ; RV32I-NEXT: mv a0, a3 ; RV32I-NEXT: bnez a5, .LBB21_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: .LBB21_6: -; RV32I-NEXT: or a7, a7, t0 -; RV32I-NEXT: or a6, t1, a6 +; RV32I-NEXT: or a6, a7, t0 +; RV32I-NEXT: or a7, t1, t2 ; RV32I-NEXT: sll t0, a0, a4 ; RV32I-NEXT: bnez a5, .LBB21_8 ; RV32I-NEXT: # %bb.7: @@ -1388,11 +1388,11 @@ ; RV32I-NEXT: sll a2, a2, a4 ; RV32I-NEXT: srli a0, a0, 1 ; RV32I-NEXT: srl a0, a0, a1 -; RV32I-NEXT: or a2, a2, a0 -; RV32I-NEXT: add a0, a7, a3 -; 
RV32I-NEXT: sltu a1, a0, a7 -; RV32I-NEXT: add a1, a2, a1 -; RV32I-NEXT: add a1, a6, a1 +; RV32I-NEXT: or a0, a2, a0 +; RV32I-NEXT: add a1, a7, a0 +; RV32I-NEXT: add a0, a6, a3 +; RV32I-NEXT: sltu a2, a0, a6 +; RV32I-NEXT: add a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: rotl_64_mask_multiple: @@ -1426,14 +1426,14 @@ ; RV32ZBB-NEXT: srl t0, t0, a1 ; RV32ZBB-NEXT: sll t1, a0, a4 ; RV32ZBB-NEXT: srli a0, a6, 1 -; RV32ZBB-NEXT: srl a6, a0, a1 +; RV32ZBB-NEXT: srl t2, a0, a1 ; RV32ZBB-NEXT: mv a0, a3 ; RV32ZBB-NEXT: bnez a5, .LBB21_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv a0, a2 ; RV32ZBB-NEXT: .LBB21_6: -; RV32ZBB-NEXT: or a7, a7, t0 -; RV32ZBB-NEXT: or a6, t1, a6 +; RV32ZBB-NEXT: or a6, a7, t0 +; RV32ZBB-NEXT: or a7, t1, t2 ; RV32ZBB-NEXT: sll t0, a0, a4 ; RV32ZBB-NEXT: bnez a5, .LBB21_8 ; RV32ZBB-NEXT: # %bb.7: @@ -1445,11 +1445,11 @@ ; RV32ZBB-NEXT: sll a2, a2, a4 ; RV32ZBB-NEXT: srli a0, a0, 1 ; RV32ZBB-NEXT: srl a0, a0, a1 -; RV32ZBB-NEXT: or a2, a2, a0 -; RV32ZBB-NEXT: add a0, a7, a3 -; RV32ZBB-NEXT: sltu a1, a0, a7 -; RV32ZBB-NEXT: add a1, a2, a1 -; RV32ZBB-NEXT: add a1, a6, a1 +; RV32ZBB-NEXT: or a0, a2, a0 +; RV32ZBB-NEXT: add a1, a7, a0 +; RV32ZBB-NEXT: add a0, a6, a3 +; RV32ZBB-NEXT: sltu a2, a0, a6 +; RV32ZBB-NEXT: add a1, a1, a2 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: rotl_64_mask_multiple: @@ -1527,16 +1527,16 @@ ; RV32I-NEXT: slli t0, a1, 1 ; RV32I-NEXT: not a0, a4 ; RV32I-NEXT: sll t0, t0, a0 -; RV32I-NEXT: srl a1, a1, a4 +; RV32I-NEXT: srl t1, a1, a4 ; RV32I-NEXT: slli a6, a6, 1 -; RV32I-NEXT: sll t1, a6, a0 +; RV32I-NEXT: sll t2, a6, a0 ; RV32I-NEXT: mv a6, a2 ; RV32I-NEXT: beqz a5, .LBB23_6 ; RV32I-NEXT: # %bb.5: ; RV32I-NEXT: mv a6, a3 ; RV32I-NEXT: .LBB23_6: -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or a1, t1, a1 +; RV32I-NEXT: or a1, t0, a7 +; RV32I-NEXT: or a7, t2, t1 ; RV32I-NEXT: srl t0, a6, a4 ; RV32I-NEXT: beqz a5, .LBB23_8 ; RV32I-NEXT: # %bb.7: @@ -1548,11 +1548,11 @@ ; RV32I-NEXT: srl a3, a3, a4 ; RV32I-NEXT: slli a6, a6, 1 ; RV32I-NEXT: sll a0, a6, a0 -; RV32I-NEXT: or a3, a0, a3 -; RV32I-NEXT: add a0, a7, a2 -; RV32I-NEXT: sltu a2, a0, a7 -; RV32I-NEXT: add a2, a3, a2 -; RV32I-NEXT: add a1, a1, a2 +; RV32I-NEXT: or a0, a0, a3 +; RV32I-NEXT: add a7, a7, a0 +; RV32I-NEXT: add a0, a1, a2 +; RV32I-NEXT: sltu a1, a0, a1 +; RV32I-NEXT: add a1, a7, a1 ; RV32I-NEXT: ret ; ; RV64I-LABEL: rotr_64_mask_multiple: @@ -1583,16 +1583,16 @@ ; RV32ZBB-NEXT: slli t0, a1, 1 ; RV32ZBB-NEXT: not a0, a4 ; RV32ZBB-NEXT: sll t0, t0, a0 -; RV32ZBB-NEXT: srl a1, a1, a4 +; RV32ZBB-NEXT: srl t1, a1, a4 ; RV32ZBB-NEXT: slli a6, a6, 1 -; RV32ZBB-NEXT: sll t1, a6, a0 +; RV32ZBB-NEXT: sll t2, a6, a0 ; RV32ZBB-NEXT: mv a6, a2 ; RV32ZBB-NEXT: beqz a5, .LBB23_6 ; RV32ZBB-NEXT: # %bb.5: ; RV32ZBB-NEXT: mv a6, a3 ; RV32ZBB-NEXT: .LBB23_6: -; RV32ZBB-NEXT: or a7, t0, a7 -; RV32ZBB-NEXT: or a1, t1, a1 +; RV32ZBB-NEXT: or a1, t0, a7 +; RV32ZBB-NEXT: or a7, t2, t1 ; RV32ZBB-NEXT: srl t0, a6, a4 ; RV32ZBB-NEXT: beqz a5, .LBB23_8 ; RV32ZBB-NEXT: # %bb.7: @@ -1604,11 +1604,11 @@ ; RV32ZBB-NEXT: srl a3, a3, a4 ; RV32ZBB-NEXT: slli a6, a6, 1 ; RV32ZBB-NEXT: sll a0, a6, a0 -; RV32ZBB-NEXT: or a3, a0, a3 -; RV32ZBB-NEXT: add a0, a7, a2 -; RV32ZBB-NEXT: sltu a2, a0, a7 -; RV32ZBB-NEXT: add a2, a3, a2 -; RV32ZBB-NEXT: add a1, a1, a2 +; RV32ZBB-NEXT: or a0, a0, a3 +; RV32ZBB-NEXT: add a7, a7, a0 +; RV32ZBB-NEXT: add a0, a1, a2 +; RV32ZBB-NEXT: sltu a1, a0, a1 +; RV32ZBB-NEXT: add a1, a7, a1 ; RV32ZBB-NEXT: ret ; ; RV64ZBB-LABEL: rotr_64_mask_multiple: diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll 
b/llvm/test/CodeGen/RISCV/rv32zbb.ll --- a/llvm/test/CodeGen/RISCV/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll @@ -720,8 +720,8 @@ ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: snez a2, a0 ; CHECK-NEXT: neg a0, a0 -; CHECK-NEXT: neg a2, a2 -; CHECK-NEXT: sub a1, a2, a1 +; CHECK-NEXT: neg a1, a1 +; CHECK-NEXT: sub a1, a1, a2 ; CHECK-NEXT: .LBB19_2: ; CHECK-NEXT: ret %abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true) @@ -774,7 +774,7 @@ ; RV32I-NEXT: and a2, a0, a2 ; RV32I-NEXT: slli a2, a2, 8 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a1, a2, a1 +; RV32I-NEXT: or a0, a0, a2 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret ; @@ -800,7 +800,7 @@ ; RV32I-NEXT: and a4, a1, a3 ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a2, a4, a2 +; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: or a2, a1, a2 ; RV32I-NEXT: srli a1, a0, 8 ; RV32I-NEXT: and a1, a1, a3 @@ -809,7 +809,7 @@ ; RV32I-NEXT: and a3, a0, a3 ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll --- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll @@ -16,8 +16,8 @@ ; CHECK-NEXT: slli a2, a2, 32 ; CHECK-NEXT: mulhu a1, a2, a1 ; CHECK-NEXT: srli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: addw a0, a3, a0 +; CHECK-NEXT: add a0, a3, a0 +; CHECK-NEXT: addw a0, a0, a1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: li a0, 0 @@ -61,8 +61,8 @@ ; CHECK-NEXT: slli a3, a3, 32 ; CHECK-NEXT: mulhu a1, a3, a1 ; CHECK-NEXT: srli a1, a1, 1 -; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: subw a0, a2, a0 +; CHECK-NEXT: subw a0, a0, a1 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: li a0, 0 diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll --- a/llvm/test/CodeGen/RISCV/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll @@ -957,7 +957,7 @@ ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: slliw a0, a0, 24 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; @@ -983,7 +983,7 @@ ; RV64I-NEXT: and a3, a0, a3 ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a2, a3, a2 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: sw a0, 0(a1) ; RV64I-NEXT: ret @@ -1016,8 +1016,8 @@ ; RV64I-NEXT: srli a5, a0, 8 ; RV64I-NEXT: srliw a5, a5, 24 ; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: or a1, a5, a1 ; RV64I-NEXT: and a4, a0, a4 ; RV64I-NEXT: slli a4, a4, 24 ; RV64I-NEXT: srliw a3, a0, 24 @@ -1026,8 +1026,8 @@ ; RV64I-NEXT: and a2, a0, a2 ; RV64I-NEXT: slli a2, a2, 40 ; RV64I-NEXT: slli a0, a0, 56 -; RV64I-NEXT: or a2, a2, a3 -; RV64I-NEXT: or a1, a2, a1 +; RV64I-NEXT: or a0, a0, a2 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll --- a/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll +++ b/llvm/test/CodeGen/RISCV/rvv/expand-no-v.ll @@ -36,8 +36,8 @@ ; RV32-NEXT: neg a2, a2 ; RV32-NEXT: and a2, a2, a4 ; RV32-NEXT: add a2, a2, a3 -; RV32-NEXT: add a2, a6, a2 -; RV32-NEXT: add a0, a2, a0 +; RV32-NEXT: add a1, a1, a6 +; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: add a0, a1, a0 ; 
RV32-NEXT: ret ; @@ -72,8 +72,8 @@ ; RV64-NEXT: negw a2, a2 ; RV64-NEXT: and a2, a2, a4 ; RV64-NEXT: add a2, a2, a3 -; RV64-NEXT: add a2, a6, a2 -; RV64-NEXT: add a0, a2, a0 +; RV64-NEXT: add a1, a1, a6 +; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: addw a0, a1, a0 ; RV64-NEXT: ret %r = call i32 @llvm.vp.reduce.add.v4i32(i32 %s, <4 x i32> %v, <4 x i1> %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-elen.ll @@ -30,20 +30,20 @@ ; RV32-NEXT: lw a3, 12(a0) ; RV32-NEXT: lw a4, 0(a0) ; RV32-NEXT: lw a5, 4(a0) -; RV32-NEXT: lw a6, 0(a1) -; RV32-NEXT: lw a7, 8(a1) -; RV32-NEXT: lw t0, 4(a1) +; RV32-NEXT: lw a6, 4(a1) +; RV32-NEXT: lw a7, 0(a1) +; RV32-NEXT: lw t0, 8(a1) ; RV32-NEXT: lw a1, 12(a1) -; RV32-NEXT: add a6, a4, a6 -; RV32-NEXT: sltu a4, a6, a4 -; RV32-NEXT: add a4, t0, a4 +; RV32-NEXT: add a5, a5, a6 +; RV32-NEXT: add a7, a4, a7 +; RV32-NEXT: sltu a4, a7, a4 ; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: add a7, a2, a7 -; RV32-NEXT: sltu a2, a7, a2 -; RV32-NEXT: add a1, a1, a2 ; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: sw a7, 8(a0) -; RV32-NEXT: sw a6, 0(a0) +; RV32-NEXT: add t0, a2, t0 +; RV32-NEXT: sltu a2, t0, a2 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sw t0, 8(a0) +; RV32-NEXT: sw a7, 0(a0) ; RV32-NEXT: sw a1, 12(a0) ; RV32-NEXT: sw a4, 4(a0) ; RV32-NEXT: ret @@ -88,15 +88,15 @@ ; RV32-LABEL: add_v1i64: ; RV32: # %bb.0: ; RV32-NEXT: lw a2, 0(a0) -; RV32-NEXT: lw a3, 0(a1) -; RV32-NEXT: lw a1, 4(a1) -; RV32-NEXT: lw a4, 4(a0) -; RV32-NEXT: add a3, a2, a3 -; RV32-NEXT: sltu a2, a3, a2 -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: add a1, a4, a1 -; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: sw a1, 4(a0) +; RV32-NEXT: lw a3, 4(a0) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a1, 0(a1) +; RV32-NEXT: add a3, a3, a4 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: sltu a2, a1, a2 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: sw a1, 0(a0) +; RV32-NEXT: sw a2, 4(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: add_v1i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-mask-vp.ll @@ -244,17 +244,17 @@ ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vcpop.m a2, v11, v0.t ; CHECK-NEXT: seqz a2, a2 -; CHECK-NEXT: addi a3, a1, -128 -; CHECK-NEXT: sltu a1, a1, a3 +; CHECK-NEXT: and a0, a2, a0 +; CHECK-NEXT: addi a2, a1, -128 +; CHECK-NEXT: sltu a1, a1, a2 ; CHECK-NEXT: addi a1, a1, -1 -; CHECK-NEXT: and a1, a1, a3 +; CHECK-NEXT: and a1, a1, a2 ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vcpop.m a1, v8, v0.t ; CHECK-NEXT: seqz a1, a1 ; CHECK-NEXT: and a0, a1, a0 -; CHECK-NEXT: and a0, a0, a2 ; CHECK-NEXT: ret %r = call i1 @llvm.vp.reduce.and.v256i1(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %evl) ret i1 %r diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll @@ -425,8 +425,8 @@ ; RV32-NEXT: or a3, a3, a4 ; RV32-NEXT: slli a5, a5, 16 ; RV32-NEXT: slli a6, a6, 24 -; RV32-NEXT: or a3, a5, a3 -; RV32-NEXT: or a3, a6, a3 +; RV32-NEXT: or a4, a6, a5 +; RV32-NEXT: or a3, a4, a3 ; 
RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: andi a2, a2, 2 @@ -446,7 +446,7 @@ ; RV32-NEXT: or a2, a2, a3 ; RV32-NEXT: slli a4, a4, 16 ; RV32-NEXT: slli a0, a0, 24 -; RV32-NEXT: or a2, a4, a2 +; RV32-NEXT: or a0, a0, a4 ; RV32-NEXT: or a0, a0, a2 ; RV32-NEXT: vmv.s.x v9, a0 ; RV32-NEXT: vslideup.vi v8, v9, 1 @@ -471,8 +471,8 @@ ; RV64-NEXT: or a3, a3, a4 ; RV64-NEXT: slli a5, a5, 16 ; RV64-NEXT: slli a6, a6, 24 -; RV64-NEXT: or a3, a5, a3 -; RV64-NEXT: or a3, a6, a3 +; RV64-NEXT: or a4, a6, a5 +; RV64-NEXT: or a3, a4, a3 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vmv.v.x v8, a3 ; RV64-NEXT: andi a2, a2, 2 @@ -492,7 +492,7 @@ ; RV64-NEXT: or a2, a2, a3 ; RV64-NEXT: slli a4, a4, 16 ; RV64-NEXT: slli a0, a0, 24 -; RV64-NEXT: or a2, a4, a2 +; RV64-NEXT: or a0, a0, a4 ; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: vmv.s.x v9, a0 ; RV64-NEXT: vslideup.vi v8, v9, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-mask-vp.ll @@ -378,8 +378,8 @@ ; CHECK-NEXT: vmv1r.v v0, v9 ; CHECK-NEXT: vcpop.m a1, v11, v0.t ; CHECK-NEXT: snez a1, a1 +; CHECK-NEXT: or a0, a1, a0 ; CHECK-NEXT: or a0, a3, a0 -; CHECK-NEXT: or a0, a0, a1 ; CHECK-NEXT: ret %r = call i1 @llvm.vp.reduce.or.nxv128i1(i1 %s, %v, %m, i32 %evl) ret i1 %r diff --git a/llvm/test/CodeGen/RISCV/sadd_sat.ll b/llvm/test/CodeGen/RISCV/sadd_sat.ll --- a/llvm/test/CodeGen/RISCV/sadd_sat.ll +++ b/llvm/test/CodeGen/RISCV/sadd_sat.ll @@ -59,10 +59,10 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: mv a4, a1 ; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: add a5, a4, a3 ; RV32I-NEXT: add a0, a0, a2 ; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: add a1, a3, a1 -; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: add a1, a5, a1 ; RV32I-NEXT: xor a2, a4, a1 ; RV32I-NEXT: xor a3, a4, a3 ; RV32I-NEXT: not a3, a3 @@ -94,10 +94,10 @@ ; RV32IZbb: # %bb.0: ; RV32IZbb-NEXT: mv a4, a1 ; RV32IZbb-NEXT: mv a1, a0 +; RV32IZbb-NEXT: add a5, a4, a3 ; RV32IZbb-NEXT: add a0, a0, a2 ; RV32IZbb-NEXT: sltu a1, a0, a1 -; RV32IZbb-NEXT: add a1, a3, a1 -; RV32IZbb-NEXT: add a1, a4, a1 +; RV32IZbb-NEXT: add a1, a5, a1 ; RV32IZbb-NEXT: xor a2, a4, a1 ; RV32IZbb-NEXT: xor a3, a4, a3 ; RV32IZbb-NEXT: andn a2, a2, a3 diff --git a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll --- a/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/sadd_sat_plus.ll @@ -65,10 +65,10 @@ ; RV32I: # %bb.0: ; RV32I-NEXT: mv a2, a1 ; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: add a3, a2, a5 ; RV32I-NEXT: add a0, a0, a4 ; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: add a1, a5, a1 -; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: add a1, a3, a1 ; RV32I-NEXT: xor a3, a2, a1 ; RV32I-NEXT: xor a2, a2, a5 ; RV32I-NEXT: not a2, a2 @@ -100,10 +100,10 @@ ; RV32IZbb: # %bb.0: ; RV32IZbb-NEXT: mv a2, a1 ; RV32IZbb-NEXT: mv a1, a0 +; RV32IZbb-NEXT: add a3, a2, a5 ; RV32IZbb-NEXT: add a0, a0, a4 ; RV32IZbb-NEXT: sltu a1, a0, a1 -; RV32IZbb-NEXT: add a1, a5, a1 -; RV32IZbb-NEXT: add a1, a2, a1 +; RV32IZbb-NEXT: add a1, a3, a1 ; RV32IZbb-NEXT: xor a3, a2, a1 ; RV32IZbb-NEXT: xor a2, a2, a5 ; RV32IZbb-NEXT: andn a2, a3, a2 diff --git a/llvm/test/CodeGen/RISCV/select-binop-identity.ll b/llvm/test/CodeGen/RISCV/select-binop-identity.ll --- a/llvm/test/CodeGen/RISCV/select-binop-identity.ll +++ b/llvm/test/CodeGen/RISCV/select-binop-identity.ll @@ -272,7 +272,7 @@ ; RV32I-NEXT: and a1, a0, 
a1 ; RV32I-NEXT: add a0, a1, a3 ; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: add a1, a4, a1 +; RV32I-NEXT: add a2, a2, a4 ; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: ret ; @@ -343,7 +343,7 @@ ; RV32I-NEXT: and a2, a0, a2 ; RV32I-NEXT: and a0, a0, a1 ; RV32I-NEXT: sltu a1, a3, a0 -; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: sub a4, a4, a2 ; RV32I-NEXT: sub a1, a4, a1 ; RV32I-NEXT: sub a0, a3, a0 ; RV32I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/shadowcallstack.ll b/llvm/test/CodeGen/RISCV/shadowcallstack.ll --- a/llvm/test/CodeGen/RISCV/shadowcallstack.ll +++ b/llvm/test/CodeGen/RISCV/shadowcallstack.ll @@ -88,8 +88,8 @@ ; RV32-NEXT: call bar@plt ; RV32-NEXT: mv s3, a0 ; RV32-NEXT: call bar@plt +; RV32-NEXT: add s0, s0, s1 ; RV32-NEXT: add a0, s3, a0 -; RV32-NEXT: add a0, s1, a0 ; RV32-NEXT: add a0, s0, a0 ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -121,8 +121,8 @@ ; RV64-NEXT: call bar@plt ; RV64-NEXT: mv s3, a0 ; RV64-NEXT: call bar@plt +; RV64-NEXT: add s0, s0, s1 ; RV64-NEXT: add a0, s3, a0 -; RV64-NEXT: add a0, s1, a0 ; RV64-NEXT: addw a0, s0, a0 ; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -215,8 +215,8 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: andi a2, a2, 7 ; RV32I-NEXT: srl a3, a3, a2 ; RV32I-NEXT: lbu a4, 5(a1) @@ -227,8 +227,8 @@ ; RV32I-NEXT: or a4, a4, a5 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a5, a4, 1 ; RV32I-NEXT: xori a6, a2, 31 ; RV32I-NEXT: sll a5, a5, a6 @@ -242,8 +242,8 @@ ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a5, t1, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: slli a7, a5, 1 ; RV32I-NEXT: not t0, a2 ; RV32I-NEXT: lbu t1, 13(a1) @@ -257,7 +257,7 @@ ; RV32I-NEXT: srl a5, a5, a2 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or a1, a1, t0 ; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: slli a7, a1, 1 ; RV32I-NEXT: sll a6, a7, a6 @@ -362,8 +362,8 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: andi a2, a2, 7 ; RV32I-NEXT: srl a3, a3, a2 ; RV32I-NEXT: lbu a4, 5(a1) @@ -374,8 +374,8 @@ ; RV32I-NEXT: or a4, a4, a5 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: or a4, a7, a4 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a4, a5, a4 ; RV32I-NEXT: slli a5, a4, 1 ; RV32I-NEXT: xori a6, a2, 31 ; RV32I-NEXT: sll a5, a5, a6 @@ -389,8 +389,8 @@ ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a5, t1, a5 +; RV32I-NEXT: or a7, t1, t0 +; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: slli a7, a5, 1 ; RV32I-NEXT: not t0, a2 ; RV32I-NEXT: lbu t1, 13(a1) @@ -404,7 +404,7 @@ ; RV32I-NEXT: srl a5, a5, a2 ; 
RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a7, t0, a7 +; RV32I-NEXT: or a1, a1, t0 ; RV32I-NEXT: or a1, a1, a7 ; RV32I-NEXT: slli a7, a1, 1 ; RV32I-NEXT: sll a6, a7, a6 @@ -504,8 +504,8 @@ ; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a1, a5, a1 -; RV32I-NEXT: or a1, a6, a1 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a1, a4, a1 ; RV32I-NEXT: andi a2, a2, 7 ; RV32I-NEXT: sll a4, a1, a2 ; RV32I-NEXT: lbu a5, 1(a3) @@ -516,8 +516,8 @@ ; RV32I-NEXT: or a5, a5, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a5, a6, a5 ; RV32I-NEXT: srli a6, a5, 1 ; RV32I-NEXT: xori a7, a2, 31 ; RV32I-NEXT: srl a6, a6, a7 @@ -530,8 +530,8 @@ ; RV32I-NEXT: or a6, a6, t0 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: or a6, t2, a6 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a6, t0, a6 ; RV32I-NEXT: sll t0, a6, a2 ; RV32I-NEXT: srli a1, a1, 1 ; RV32I-NEXT: not t1, a2 @@ -545,7 +545,7 @@ ; RV32I-NEXT: or t0, t0, t1 ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or t0, t2, t0 +; RV32I-NEXT: or a3, a3, t2 ; RV32I-NEXT: or a3, a3, t0 ; RV32I-NEXT: sll a3, a3, a2 ; RV32I-NEXT: srli a6, a6, 1 diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll --- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll @@ -21,11 +21,11 @@ ; RV32-NEXT: addi a3, a3, -1366 ; RV32-NEXT: mul a3, a5, a3 ; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a0, a1, a4 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a1, a6, a0 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; @@ -46,11 +46,11 @@ ; RV64-NEXT: sub a4, a0, a3 ; RV64-NEXT: mul a5, a4, a6 ; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 ; RV64-NEXT: sltu a0, a0, a3 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a2 -; RV64-NEXT: add a0, a5, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, a5, a1 ; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 3 @@ -74,11 +74,11 @@ ; RV32-NEXT: addi a3, a3, -820 ; RV32-NEXT: mul a3, a5, a3 ; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a0, a1, a4 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a1, a6, a0 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; @@ -99,11 +99,11 @@ ; RV64-NEXT: sub a4, a0, a3 ; RV64-NEXT: mul a5, a4, a6 ; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 ; RV64-NEXT: sltu a0, a0, a3 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a2 -; RV64-NEXT: add a0, a5, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, a5, a1 ; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 5 @@ -181,11 +181,11 @@ ; RV32-NEXT: mul a5, a3, a5 ; RV32-NEXT: addi a4, a4, -273 ; RV32-NEXT: mulhu a6, a3, a4 +; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a0, a1, a4 -; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a1, a6, a0 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a5, a1 ; RV32-NEXT: mul 
a0, a3, a4 ; RV32-NEXT: ret ; @@ -208,11 +208,11 @@ ; RV64-NEXT: sub a3, a0, a2 ; RV64-NEXT: mul a4, a3, a4 ; RV64-NEXT: mulhu a6, a3, a5 +; RV64-NEXT: add a4, a6, a4 ; RV64-NEXT: sltu a0, a0, a2 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a5 -; RV64-NEXT: add a0, a4, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, a1, a5 +; RV64-NEXT: add a1, a4, a1 ; RV64-NEXT: mul a0, a3, a5 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 15 @@ -236,11 +236,11 @@ ; RV32-NEXT: addi a3, a3, 240 ; RV32-NEXT: mul a3, a5, a3 ; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a0, a1, a4 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a1, a6, a0 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; @@ -261,11 +261,11 @@ ; RV64-NEXT: sub a4, a0, a3 ; RV64-NEXT: mul a5, a4, a6 ; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 ; RV64-NEXT: sltu a0, a0, a3 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a2 -; RV64-NEXT: add a0, a5, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, a5, a1 ; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 17 @@ -291,11 +291,11 @@ ; RV32-NEXT: mul a5, a3, a5 ; RV32-NEXT: addi a4, a4, -257 ; RV32-NEXT: mulhu a6, a3, a4 +; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a0, a1, a4 -; RV32-NEXT: add a0, a5, a0 -; RV32-NEXT: add a1, a6, a0 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a5, a1 ; RV32-NEXT: mul a0, a3, a4 ; RV32-NEXT: ret ; @@ -318,11 +318,11 @@ ; RV64-NEXT: sub a3, a0, a2 ; RV64-NEXT: mul a4, a3, a4 ; RV64-NEXT: mulhu a6, a3, a5 +; RV64-NEXT: add a4, a6, a4 ; RV64-NEXT: sltu a0, a0, a2 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a5 -; RV64-NEXT: add a0, a4, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, a1, a5 +; RV64-NEXT: add a1, a4, a1 ; RV64-NEXT: mul a0, a3, a5 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 255 @@ -346,11 +346,11 @@ ; RV32-NEXT: addi a3, a3, -256 ; RV32-NEXT: mul a3, a5, a3 ; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a0, a1, a4 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a1, a6, a0 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; @@ -371,11 +371,11 @@ ; RV64-NEXT: sub a4, a0, a3 ; RV64-NEXT: mul a5, a4, a6 ; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 ; RV64-NEXT: sltu a0, a0, a3 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a2 -; RV64-NEXT: add a0, a5, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, a5, a1 ; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 257 @@ -401,12 +401,12 @@ ; RV32-NEXT: mul a5, a3, a5 ; RV32-NEXT: addi a4, a4, -1 ; RV32-NEXT: mulhu a4, a3, a4 +; RV32-NEXT: add a4, a4, a5 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 ; RV32-NEXT: slli a0, a1, 16 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: sub a1, a5, a0 -; RV32-NEXT: add a1, a4, a1 +; RV32-NEXT: sub a1, a4, a0 ; RV32-NEXT: slli a0, a3, 16 ; RV32-NEXT: neg a2, a3 ; RV32-NEXT: sub a0, a2, a0 @@ -433,11 +433,11 @@ ; RV64-NEXT: mul a5, a3, a5 ; RV64-NEXT: addi a4, a4, -1 ; RV64-NEXT: mulhu a6, a3, a4 +; RV64-NEXT: add a5, a6, a5 ; RV64-NEXT: sltu a0, a0, a2 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a4 -; RV64-NEXT: add a0, a5, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, 
a1, a4 +; RV64-NEXT: add a1, a5, a1 ; RV64-NEXT: mul a0, a3, a4 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 65535 @@ -460,12 +460,12 @@ ; RV32-NEXT: sub a3, a0, a2 ; RV32-NEXT: mulhu a4, a3, a4 ; RV32-NEXT: slli a5, a3, 16 +; RV32-NEXT: sub a4, a4, a5 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 ; RV32-NEXT: slli a0, a1, 16 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: sub a0, a5, a1 -; RV32-NEXT: sub a1, a4, a0 +; RV32-NEXT: add a1, a4, a1 ; RV32-NEXT: sub a0, a3, a5 ; RV32-NEXT: ret ; @@ -488,11 +488,11 @@ ; RV64-NEXT: sub a5, a0, a2 ; RV64-NEXT: mul a3, a5, a3 ; RV64-NEXT: mulhu a6, a5, a4 +; RV64-NEXT: add a3, a6, a3 ; RV64-NEXT: sltu a0, a0, a2 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a4 -; RV64-NEXT: add a0, a3, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, a1, a4 +; RV64-NEXT: add a1, a3, a1 ; RV64-NEXT: mul a0, a5, a4 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 65537 @@ -520,11 +520,11 @@ ; RV32-NEXT: addi a3, a3, -1366 ; RV32-NEXT: mul a3, a5, a3 ; RV32-NEXT: mulhu a6, a5, a4 +; RV32-NEXT: add a3, a6, a3 ; RV32-NEXT: sltu a0, a0, a2 ; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: mul a0, a1, a4 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a1, a6, a0 +; RV32-NEXT: mul a1, a1, a4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: mul a0, a5, a4 ; RV32-NEXT: ret ; @@ -549,11 +549,11 @@ ; RV64-NEXT: sub a4, a0, a3 ; RV64-NEXT: mul a5, a4, a6 ; RV64-NEXT: mulhu a6, a4, a2 +; RV64-NEXT: add a5, a6, a5 ; RV64-NEXT: sltu a0, a0, a3 ; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: mul a0, a1, a2 -; RV64-NEXT: add a0, a5, a0 -; RV64-NEXT: add a1, a6, a0 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, a5, a1 ; RV64-NEXT: mul a0, a4, a2 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 12 diff --git a/llvm/test/CodeGen/RISCV/srem-lkk.ll b/llvm/test/CodeGen/RISCV/srem-lkk.ll --- a/llvm/test/CodeGen/RISCV/srem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-lkk.ll @@ -240,7 +240,7 @@ ; RV32IM-NEXT: add a1, a1, a2 ; RV32IM-NEXT: li a2, 95 ; RV32IM-NEXT: mul a2, a1, a2 -; RV32IM-NEXT: sub a2, a2, a1 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: sub a0, a0, a2 ; RV32IM-NEXT: ret ; @@ -278,7 +278,7 @@ ; RV64IM-NEXT: add a1, a1, a2 ; RV64IM-NEXT: li a2, 95 ; RV64IM-NEXT: mulw a2, a1, a2 -; RV64IM-NEXT: subw a2, a2, a1 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: subw a0, a0, a2 ; RV64IM-NEXT: ret %1 = srem i32 %x, 95 diff --git a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -366,8 +366,8 @@ ; RV32-NEXT: andi a1, a1, 1 ; RV32-NEXT: slli a1, a1, 1 ; RV32-NEXT: slli a0, a0, 2 -; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: or a0, a2, a0 +; RV32-NEXT: or a0, a0, a1 ; RV32-NEXT: sw a0, 8(s0) ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -516,8 +516,8 @@ ; RV32M-NEXT: andi a1, a1, 1 ; RV32M-NEXT: slli a1, a1, 1 ; RV32M-NEXT: slli a0, a0, 2 -; RV32M-NEXT: or a0, a1, a0 ; RV32M-NEXT: or a0, a2, a0 +; RV32M-NEXT: or a0, a0, a1 ; RV32M-NEXT: sw a0, 8(s0) ; RV32M-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32M-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -561,8 +561,8 @@ ; RV64M-NEXT: srai a4, a4, 1 ; RV64M-NEXT: add a4, a4, a5 ; RV64M-NEXT: slli a5, a4, 3 -; RV64M-NEXT: sub a3, a3, a5 ; RV64M-NEXT: add a3, a3, a4 +; RV64M-NEXT: sub a3, a3, a5 ; RV64M-NEXT: addi a3, a3, -1 ; RV64M-NEXT: seqz a3, a3 ; RV64M-NEXT: lui a4, %hi(.LCPI3_2) @@ -691,8 +691,8 @@ ; RV32MV-NEXT: andi a2, a2, 1 ; RV32MV-NEXT: 
slli a2, a2, 1 ; RV32MV-NEXT: slli a0, a0, 2 -; RV32MV-NEXT: or a0, a2, a0 ; RV32MV-NEXT: or a0, a1, a0 +; RV32MV-NEXT: or a0, a0, a2 ; RV32MV-NEXT: sw a0, 8(s2) ; RV32MV-NEXT: addi sp, s0, -64 ; RV32MV-NEXT: lw ra, 60(sp) # 4-byte Folded Reload @@ -743,8 +743,8 @@ ; RV64MV-NEXT: slli a4, a3, 3 ; RV64MV-NEXT: lui a5, %hi(.LCPI3_2) ; RV64MV-NEXT: ld a5, %lo(.LCPI3_2)(a5) -; RV64MV-NEXT: sub a2, a2, a4 ; RV64MV-NEXT: add a2, a2, a3 +; RV64MV-NEXT: sub a2, a2, a4 ; RV64MV-NEXT: sd a2, 8(sp) ; RV64MV-NEXT: mulh a2, a1, a5 ; RV64MV-NEXT: srli a3, a2, 63 diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll --- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll @@ -476,14 +476,14 @@ ; RV32IM-NEXT: srai a5, a5, 6 ; RV32IM-NEXT: add a5, a5, t5 ; RV32IM-NEXT: mul a7, a5, a7 -; RV32IM-NEXT: sub a5, a7, a5 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: sub a5, t4, t3 -; RV32IM-NEXT: sub a3, a3, a5 -; RV32IM-NEXT: sub a5, t2, t1 -; RV32IM-NEXT: sub a1, a1, a5 -; RV32IM-NEXT: sub a5, t0, a6 -; RV32IM-NEXT: sub a4, a4, a5 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: sub a2, a2, a7 +; RV32IM-NEXT: add a3, a3, t3 +; RV32IM-NEXT: sub a3, a3, t4 +; RV32IM-NEXT: add a1, a1, t1 +; RV32IM-NEXT: sub a1, a1, t2 +; RV32IM-NEXT: add a4, a4, a6 +; RV32IM-NEXT: sub a4, a4, t0 ; RV32IM-NEXT: sh a4, 6(a0) ; RV32IM-NEXT: sh a1, 4(a0) ; RV32IM-NEXT: sh a3, 2(a0) @@ -593,18 +593,18 @@ ; RV64IM-NEXT: srai a3, a3, 6 ; RV64IM-NEXT: add a3, a3, t5 ; RV64IM-NEXT: mulw a7, a3, a7 -; RV64IM-NEXT: subw a3, a7, a3 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: subw a3, t4, t3 -; RV64IM-NEXT: subw a5, a5, a3 -; RV64IM-NEXT: subw a3, t2, t1 -; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: subw a3, t0, a6 -; RV64IM-NEXT: subw a2, a2, a3 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: subw a3, a3, a7 +; RV64IM-NEXT: add a5, a5, t3 +; RV64IM-NEXT: subw a4, a5, t4 +; RV64IM-NEXT: add a1, a1, t1 +; RV64IM-NEXT: subw a1, a1, t2 +; RV64IM-NEXT: add a2, a2, a6 +; RV64IM-NEXT: subw a2, a2, t0 ; RV64IM-NEXT: sh a2, 6(a0) ; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a5, 2(a0) -; RV64IM-NEXT: sh a4, 0(a0) +; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: ret %1 = srem <4 x i16> %x, %2 = sdiv <4 x i16> %x, diff --git a/llvm/test/CodeGen/RISCV/ssub_sat.ll b/llvm/test/CodeGen/RISCV/ssub_sat.ll --- a/llvm/test/CodeGen/RISCV/ssub_sat.ll +++ b/llvm/test/CodeGen/RISCV/ssub_sat.ll @@ -59,8 +59,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: mv a4, a1 ; RV32-NEXT: sltu a1, a0, a2 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: sub a1, a4, a1 +; RV32-NEXT: sub a5, a4, a3 +; RV32-NEXT: sub a1, a5, a1 ; RV32-NEXT: xor a5, a4, a1 ; RV32-NEXT: xor a3, a4, a3 ; RV32-NEXT: and a3, a3, a5 diff --git a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll --- a/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/ssub_sat_plus.ll @@ -65,8 +65,8 @@ ; RV32: # %bb.0: ; RV32-NEXT: mv a2, a1 ; RV32-NEXT: sltu a1, a0, a4 -; RV32-NEXT: add a1, a5, a1 -; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: sub a3, a2, a5 +; RV32-NEXT: sub a1, a3, a1 ; RV32-NEXT: xor a3, a2, a1 ; RV32-NEXT: xor a2, a2, a5 ; RV32-NEXT: and a2, a2, a3 diff --git a/llvm/test/CodeGen/RISCV/uadd_sat.ll b/llvm/test/CodeGen/RISCV/uadd_sat.ll --- a/llvm/test/CodeGen/RISCV/uadd_sat.ll +++ b/llvm/test/CodeGen/RISCV/uadd_sat.ll @@ -47,10 +47,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; RV32I-LABEL: func2: ; RV32I: # %bb.0: +; RV32I-NEXT: add a3, a1, a3 ; 
RV32I-NEXT: add a2, a0, a2 ; RV32I-NEXT: sltu a0, a2, a0 ; RV32I-NEXT: add a3, a3, a0 -; RV32I-NEXT: add a3, a1, a3 ; RV32I-NEXT: beq a3, a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu a0, a3, a1 @@ -70,10 +70,10 @@ ; ; RV32IZbb-LABEL: func2: ; RV32IZbb: # %bb.0: +; RV32IZbb-NEXT: add a3, a1, a3 ; RV32IZbb-NEXT: add a2, a0, a2 ; RV32IZbb-NEXT: sltu a0, a2, a0 ; RV32IZbb-NEXT: add a3, a3, a0 -; RV32IZbb-NEXT: add a3, a1, a3 ; RV32IZbb-NEXT: beq a3, a1, .LBB1_2 ; RV32IZbb-NEXT: # %bb.1: ; RV32IZbb-NEXT: sltu a0, a3, a1 diff --git a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll --- a/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/uadd_sat_plus.ll @@ -54,10 +54,10 @@ define i64 @func64(i64 %x, i64 %y, i64 %z) nounwind { ; RV32I-LABEL: func64: ; RV32I: # %bb.0: +; RV32I-NEXT: add a2, a1, a5 ; RV32I-NEXT: add a4, a0, a4 ; RV32I-NEXT: sltu a0, a4, a0 -; RV32I-NEXT: add a2, a5, a0 -; RV32I-NEXT: add a2, a1, a2 +; RV32I-NEXT: add a2, a2, a0 ; RV32I-NEXT: beq a2, a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu a0, a2, a1 @@ -77,10 +77,10 @@ ; ; RV32IZbb-LABEL: func64: ; RV32IZbb: # %bb.0: +; RV32IZbb-NEXT: add a2, a1, a5 ; RV32IZbb-NEXT: add a4, a0, a4 ; RV32IZbb-NEXT: sltu a0, a4, a0 -; RV32IZbb-NEXT: add a2, a5, a0 -; RV32IZbb-NEXT: add a2, a1, a2 +; RV32IZbb-NEXT: add a2, a2, a0 ; RV32IZbb-NEXT: beq a2, a1, .LBB1_2 ; RV32IZbb-NEXT: # %bb.1: ; RV32IZbb-NEXT: sltu a0, a2, a1 diff --git a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/RISCV/umulo-128-legalisation-lowering.ll @@ -10,19 +10,19 @@ ; RISCV32-NEXT: sw s2, 20(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s3, 16(sp) # 4-byte Folded Spill ; RISCV32-NEXT: sw s4, 12(sp) # 4-byte Folded Spill -; RISCV32-NEXT: lw a4, 12(a1) +; RISCV32-NEXT: lw a3, 12(a1) ; RISCV32-NEXT: lw a7, 12(a2) ; RISCV32-NEXT: lw a6, 8(a1) -; RISCV32-NEXT: lw a3, 0(a2) +; RISCV32-NEXT: lw a4, 0(a2) ; RISCV32-NEXT: lw a5, 0(a1) ; RISCV32-NEXT: lw t3, 4(a1) ; RISCV32-NEXT: lw t0, 8(a2) ; RISCV32-NEXT: lw a2, 4(a2) -; RISCV32-NEXT: mulhu a1, a5, a3 -; RISCV32-NEXT: mul t1, t3, a3 +; RISCV32-NEXT: mulhu a1, a5, a4 +; RISCV32-NEXT: mul t1, t3, a4 ; RISCV32-NEXT: add a1, t1, a1 ; RISCV32-NEXT: sltu t1, a1, t1 -; RISCV32-NEXT: mulhu t2, t3, a3 +; RISCV32-NEXT: mulhu t2, t3, a4 ; RISCV32-NEXT: add t4, t2, t1 ; RISCV32-NEXT: mul t1, a5, a2 ; RISCV32-NEXT: add a1, t1, a1 @@ -33,65 +33,65 @@ ; RISCV32-NEXT: mul t6, t3, a2 ; RISCV32-NEXT: add s0, t6, t5 ; RISCV32-NEXT: mul t1, t0, a5 -; RISCV32-NEXT: mul s3, a6, a3 +; RISCV32-NEXT: mul s3, a6, a4 ; RISCV32-NEXT: add s4, s3, t1 ; RISCV32-NEXT: add t1, s0, s4 ; RISCV32-NEXT: sltu t2, t1, s0 -; RISCV32-NEXT: sltu t6, s0, t6 +; RISCV32-NEXT: sltu s0, s0, t6 ; RISCV32-NEXT: sltu t4, t5, t4 -; RISCV32-NEXT: mulhu s1, t3, a2 -; RISCV32-NEXT: add t4, t4, t6 -; RISCV32-NEXT: add s1, s1, t4 +; RISCV32-NEXT: mulhu t5, t3, a2 +; RISCV32-NEXT: add t4, t5, t4 +; RISCV32-NEXT: add s0, t4, s0 ; RISCV32-NEXT: mul t4, t3, t0 -; RISCV32-NEXT: mul s2, a7, a5 -; RISCV32-NEXT: mulhu s0, t0, a5 -; RISCV32-NEXT: add t4, s0, t4 -; RISCV32-NEXT: add s2, t4, s2 +; RISCV32-NEXT: mul t5, a7, a5 +; RISCV32-NEXT: add t4, t5, t4 +; RISCV32-NEXT: mulhu s1, t0, a5 +; RISCV32-NEXT: add s2, s1, t4 ; RISCV32-NEXT: mul t4, a2, a6 -; RISCV32-NEXT: mul t6, a4, a3 -; RISCV32-NEXT: mulhu t5, a6, a3 +; RISCV32-NEXT: mul t5, a3, a4 ; 
RISCV32-NEXT: add t4, t5, t4 -; RISCV32-NEXT: add t6, t4, t6 -; RISCV32-NEXT: sltu t4, s4, s3 -; RISCV32-NEXT: add t4, s2, t4 -; RISCV32-NEXT: add t4, t6, t4 +; RISCV32-NEXT: mulhu t5, a6, a4 +; RISCV32-NEXT: add t6, t5, t4 +; RISCV32-NEXT: add t4, t6, s2 +; RISCV32-NEXT: sltu s3, s4, s3 +; RISCV32-NEXT: add t4, t4, s3 +; RISCV32-NEXT: add t4, s0, t4 ; RISCV32-NEXT: add t4, t4, t2 -; RISCV32-NEXT: add t4, s1, t4 -; RISCV32-NEXT: beq t4, s1, .LBB0_2 +; RISCV32-NEXT: beq t4, s0, .LBB0_2 ; RISCV32-NEXT: # %bb.1: # %start -; RISCV32-NEXT: sltu t2, t4, s1 +; RISCV32-NEXT: sltu t2, t4, s0 ; RISCV32-NEXT: .LBB0_2: # %start -; RISCV32-NEXT: sltu s0, s2, s0 +; RISCV32-NEXT: sltu s0, s2, s1 ; RISCV32-NEXT: snez s1, t3 ; RISCV32-NEXT: snez s2, a7 ; RISCV32-NEXT: and s1, s2, s1 ; RISCV32-NEXT: mulhu s2, a7, a5 ; RISCV32-NEXT: snez s2, s2 +; RISCV32-NEXT: or s1, s1, s2 ; RISCV32-NEXT: mulhu t3, t3, t0 ; RISCV32-NEXT: snez t3, t3 -; RISCV32-NEXT: or t3, s2, t3 -; RISCV32-NEXT: or t3, t3, s0 ; RISCV32-NEXT: or t3, s1, t3 +; RISCV32-NEXT: or t3, t3, s0 ; RISCV32-NEXT: sltu t5, t6, t5 ; RISCV32-NEXT: snez t6, a2 -; RISCV32-NEXT: snez s0, a4 +; RISCV32-NEXT: snez s0, a3 ; RISCV32-NEXT: and t6, s0, t6 -; RISCV32-NEXT: mulhu s0, a4, a3 +; RISCV32-NEXT: mulhu s0, a3, a4 ; RISCV32-NEXT: snez s0, s0 +; RISCV32-NEXT: or t6, t6, s0 ; RISCV32-NEXT: mulhu a2, a2, a6 ; RISCV32-NEXT: snez a2, a2 -; RISCV32-NEXT: or a2, s0, a2 +; RISCV32-NEXT: or a2, t6, a2 ; RISCV32-NEXT: or a2, a2, t5 ; RISCV32-NEXT: or a7, t0, a7 ; RISCV32-NEXT: snez a7, a7 -; RISCV32-NEXT: or a4, a6, a4 -; RISCV32-NEXT: snez a4, a4 -; RISCV32-NEXT: and a4, a4, a7 -; RISCV32-NEXT: or a2, a4, a2 -; RISCV32-NEXT: or a4, t6, t3 -; RISCV32-NEXT: or a4, a4, t2 -; RISCV32-NEXT: or a2, a2, a4 -; RISCV32-NEXT: mul a3, a5, a3 +; RISCV32-NEXT: or a3, a6, a3 +; RISCV32-NEXT: snez a3, a3 +; RISCV32-NEXT: and a3, a3, a7 +; RISCV32-NEXT: or a2, a3, a2 +; RISCV32-NEXT: or a3, t3, t2 +; RISCV32-NEXT: or a2, a2, a3 +; RISCV32-NEXT: mul a3, a5, a4 ; RISCV32-NEXT: andi a2, a2, 1 ; RISCV32-NEXT: sw a3, 0(a0) ; RISCV32-NEXT: sw a1, 4(a0) diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll --- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll +++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll @@ -40,11 +40,11 @@ ; NOMISALIGN-LABEL: load_i24: ; NOMISALIGN: # %bb.0: ; NOMISALIGN-NEXT: lbu a1, 1(a0) -; NOMISALIGN-NEXT: lb a2, 2(a0) -; NOMISALIGN-NEXT: lbu a0, 0(a0) +; NOMISALIGN-NEXT: lbu a2, 0(a0) +; NOMISALIGN-NEXT: lb a0, 2(a0) ; NOMISALIGN-NEXT: slli a1, a1, 8 -; NOMISALIGN-NEXT: slli a2, a2, 16 -; NOMISALIGN-NEXT: or a0, a0, a2 +; NOMISALIGN-NEXT: or a1, a1, a2 +; NOMISALIGN-NEXT: slli a0, a0, 16 ; NOMISALIGN-NEXT: or a0, a1, a0 ; NOMISALIGN-NEXT: ret ; @@ -70,7 +70,7 @@ ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a1, a3, a1 +; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: ret ; @@ -84,7 +84,7 @@ ; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: slli a3, a3, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; @@ -107,8 +107,8 @@ ; RV32I-NEXT: or a1, a1, a2 ; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a1, a3, a1 -; RV32I-NEXT: or a2, a4, a1 +; RV32I-NEXT: or a2, a4, a3 +; RV32I-NEXT: or a2, a2, a1 ; RV32I-NEXT: lbu a1, 5(a0) ; RV32I-NEXT: lbu a3, 4(a0) ; RV32I-NEXT: lbu a4, 6(a0) @@ -117,7 +117,7 @@ ; RV32I-NEXT: or 
a1, a1, a3 ; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: or a1, a0, a1 ; RV32I-NEXT: mv a0, a2 ; RV32I-NEXT: ret @@ -127,25 +127,25 @@ ; RV64I-NEXT: lbu a1, 1(a0) ; RV64I-NEXT: lbu a2, 0(a0) ; RV64I-NEXT: lbu a3, 2(a0) +; RV64I-NEXT: lbu a4, 3(a0) ; RV64I-NEXT: slli a1, a1, 8 ; RV64I-NEXT: or a1, a1, a2 ; RV64I-NEXT: slli a3, a3, 16 -; RV64I-NEXT: lbu a2, 5(a0) -; RV64I-NEXT: lbu a4, 3(a0) +; RV64I-NEXT: slli a4, a4, 24 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a1, a3, a1 +; RV64I-NEXT: lbu a2, 5(a0) ; RV64I-NEXT: lbu a3, 4(a0) -; RV64I-NEXT: slli a2, a2, 8 -; RV64I-NEXT: lbu a5, 6(a0) +; RV64I-NEXT: lbu a4, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a2, a2, 8 ; RV64I-NEXT: or a2, a2, a3 -; RV64I-NEXT: slli a4, a4, 24 -; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a2, a5, a2 +; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: or a0, a0, a2 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a1 -; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: ret ; ; MISALIGN-RV32I-LABEL: load_i64: diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -140,7 +140,7 @@ ; RV32IM-NEXT: srli a1, a1, 6 ; RV32IM-NEXT: li a2, 95 ; RV32IM-NEXT: mul a2, a1, a2 -; RV32IM-NEXT: sub a2, a2, a1 +; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: sub a0, a0, a2 ; RV32IM-NEXT: ret ; @@ -180,7 +180,7 @@ ; RV64IM-NEXT: srli a1, a1, 6 ; RV64IM-NEXT: li a2, 95 ; RV64IM-NEXT: mulw a2, a1, a2 -; RV64IM-NEXT: subw a2, a2, a1 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: subw a0, a0, a2 ; RV64IM-NEXT: ret %1 = urem i32 %x, 95 diff --git a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/RISCV/urem-seteq-illegal-types.ll @@ -579,8 +579,8 @@ ; RV32MV-NEXT: andi a3, a3, 2047 ; RV32MV-NEXT: slli a3, a3, 11 ; RV32MV-NEXT: slli a1, a1, 22 -; RV32MV-NEXT: or a1, a3, a1 ; RV32MV-NEXT: or a1, a2, a1 +; RV32MV-NEXT: or a1, a1, a3 ; RV32MV-NEXT: sw a1, 0(a0) ; RV32MV-NEXT: addi sp, sp, 16 ; RV32MV-NEXT: ret @@ -641,7 +641,7 @@ ; RV64MV-NEXT: vslidedown.vi v8, v8, 2 ; RV64MV-NEXT: vmv.x.s a3, v8 ; RV64MV-NEXT: slli a3, a3, 22 -; RV64MV-NEXT: or a2, a2, a3 +; RV64MV-NEXT: or a1, a1, a3 ; RV64MV-NEXT: or a1, a1, a2 ; RV64MV-NEXT: sw a1, 0(a0) ; RV64MV-NEXT: slli a1, a1, 31 diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -401,14 +401,14 @@ ; RV32IM-NEXT: mul t4, t3, a7 ; RV32IM-NEXT: mulhu a5, a2, a5 ; RV32IM-NEXT: mul a7, a5, a7 -; RV32IM-NEXT: sub a5, a7, a5 -; RV32IM-NEXT: sub a2, a2, a5 -; RV32IM-NEXT: sub a5, t4, t3 -; RV32IM-NEXT: sub a3, a3, a5 -; RV32IM-NEXT: sub a5, t2, t1 -; RV32IM-NEXT: sub a1, a1, a5 -; RV32IM-NEXT: sub a5, t0, a6 -; RV32IM-NEXT: sub a4, a4, a5 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: sub a2, a2, a7 +; RV32IM-NEXT: add a3, a3, t3 +; RV32IM-NEXT: sub a3, a3, t4 +; RV32IM-NEXT: add a1, a1, t1 +; RV32IM-NEXT: sub a1, a1, t2 +; RV32IM-NEXT: add a4, a4, a6 +; RV32IM-NEXT: sub a4, a4, t0 ; RV32IM-NEXT: sh a4, 6(a0) ; RV32IM-NEXT: sh a1, 4(a0) ; RV32IM-NEXT: sh a3, 2(a0) @@ -502,18 +502,18 @@ ; RV64IM-NEXT: mulw t4, t3, a7 ; RV64IM-NEXT: mulhu a3, a4, a3 ; RV64IM-NEXT: 
mulw a7, a3, a7 -; RV64IM-NEXT: subw a3, a7, a3 -; RV64IM-NEXT: subw a4, a4, a3 -; RV64IM-NEXT: subw a3, t4, t3 -; RV64IM-NEXT: subw a5, a5, a3 -; RV64IM-NEXT: subw a3, t2, t1 -; RV64IM-NEXT: subw a1, a1, a3 -; RV64IM-NEXT: subw a3, t0, a6 -; RV64IM-NEXT: subw a2, a2, a3 +; RV64IM-NEXT: add a3, a4, a3 +; RV64IM-NEXT: subw a3, a3, a7 +; RV64IM-NEXT: add a5, a5, t3 +; RV64IM-NEXT: subw a4, a5, t4 +; RV64IM-NEXT: add a1, a1, t1 +; RV64IM-NEXT: subw a1, a1, t2 +; RV64IM-NEXT: add a2, a2, a6 +; RV64IM-NEXT: subw a2, a2, t0 ; RV64IM-NEXT: sh a2, 6(a0) ; RV64IM-NEXT: sh a1, 4(a0) -; RV64IM-NEXT: sh a5, 2(a0) -; RV64IM-NEXT: sh a4, 0(a0) +; RV64IM-NEXT: sh a4, 2(a0) +; RV64IM-NEXT: sh a3, 0(a0) ; RV64IM-NEXT: ret %1 = urem <4 x i16> %x, %2 = udiv <4 x i16> %x, diff --git a/llvm/test/CodeGen/RISCV/usub_sat.ll b/llvm/test/CodeGen/RISCV/usub_sat.ll --- a/llvm/test/CodeGen/RISCV/usub_sat.ll +++ b/llvm/test/CodeGen/RISCV/usub_sat.ll @@ -46,8 +46,8 @@ ; RV32I-LABEL: func2: ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a4, a0, a2 -; RV32I-NEXT: add a3, a3, a4 ; RV32I-NEXT: sub a3, a1, a3 +; RV32I-NEXT: sub a3, a3, a4 ; RV32I-NEXT: sub a2, a0, a2 ; RV32I-NEXT: beq a3, a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: @@ -72,8 +72,8 @@ ; RV32IZbb-LABEL: func2: ; RV32IZbb: # %bb.0: ; RV32IZbb-NEXT: sltu a4, a0, a2 -; RV32IZbb-NEXT: add a3, a3, a4 ; RV32IZbb-NEXT: sub a3, a1, a3 +; RV32IZbb-NEXT: sub a3, a3, a4 ; RV32IZbb-NEXT: sub a2, a0, a2 ; RV32IZbb-NEXT: beq a3, a1, .LBB1_2 ; RV32IZbb-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll --- a/llvm/test/CodeGen/RISCV/usub_sat_plus.ll +++ b/llvm/test/CodeGen/RISCV/usub_sat_plus.ll @@ -53,8 +53,8 @@ ; RV32I-LABEL: func64: ; RV32I: # %bb.0: ; RV32I-NEXT: sltu a2, a0, a4 -; RV32I-NEXT: add a2, a5, a2 -; RV32I-NEXT: sub a2, a1, a2 +; RV32I-NEXT: sub a3, a1, a5 +; RV32I-NEXT: sub a2, a3, a2 ; RV32I-NEXT: sub a3, a0, a4 ; RV32I-NEXT: beq a2, a1, .LBB1_2 ; RV32I-NEXT: # %bb.1: @@ -79,8 +79,8 @@ ; RV32IZbb-LABEL: func64: ; RV32IZbb: # %bb.0: ; RV32IZbb-NEXT: sltu a2, a0, a4 -; RV32IZbb-NEXT: add a2, a5, a2 -; RV32IZbb-NEXT: sub a2, a1, a2 +; RV32IZbb-NEXT: sub a3, a1, a5 +; RV32IZbb-NEXT: sub a2, a3, a2 ; RV32IZbb-NEXT: sub a3, a0, a4 ; RV32IZbb-NEXT: beq a2, a1, .LBB1_2 ; RV32IZbb-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll --- a/llvm/test/CodeGen/RISCV/vararg.ll +++ b/llvm/test/CodeGen/RISCV/vararg.ll @@ -808,11 +808,11 @@ ; ILP32-ILP32F-FPELIM-NEXT: andi a0, a0, -8 ; ILP32-ILP32F-FPELIM-NEXT: addi a3, sp, 27 ; ILP32-ILP32F-FPELIM-NEXT: sw a3, 4(sp) -; ILP32-ILP32F-FPELIM-NEXT: lw a3, 0(a0) -; ILP32-ILP32F-FPELIM-NEXT: lw a4, 4(a0) -; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a3 +; ILP32-ILP32F-FPELIM-NEXT: lw a3, 4(a0) +; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a0) +; ILP32-ILP32F-FPELIM-NEXT: add a2, a2, a3 +; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a0 ; ILP32-ILP32F-FPELIM-NEXT: sltu a1, a0, a1 -; ILP32-ILP32F-FPELIM-NEXT: add a1, a4, a1 ; ILP32-ILP32F-FPELIM-NEXT: add a1, a2, a1 ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 32 ; ILP32-ILP32F-FPELIM-NEXT: ret @@ -832,11 +832,11 @@ ; ILP32-ILP32F-WITHFP-NEXT: andi a0, a0, -8 ; ILP32-ILP32F-WITHFP-NEXT: addi a3, s0, 19 ; ILP32-ILP32F-WITHFP-NEXT: sw a3, -12(s0) -; ILP32-ILP32F-WITHFP-NEXT: lw a3, 0(a0) -; ILP32-ILP32F-WITHFP-NEXT: lw a4, 4(a0) -; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a3 +; ILP32-ILP32F-WITHFP-NEXT: lw a3, 4(a0) +; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a0) +; ILP32-ILP32F-WITHFP-NEXT: add a2, a2, a3 +; 
ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a0 ; ILP32-ILP32F-WITHFP-NEXT: sltu a1, a0, a1 -; ILP32-ILP32F-WITHFP-NEXT: add a1, a4, a1 ; ILP32-ILP32F-WITHFP-NEXT: add a1, a2, a1 ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload ; ILP32-ILP32F-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload @@ -855,11 +855,11 @@ ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: andi a0, a0, -8 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, sp, 27 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 4(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 0(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a4, 4(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a3 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 4(a0) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a0) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a3 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sltu a1, a0, a1 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a4, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a2, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 32 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: ret @@ -951,7 +951,7 @@ ; ILP32-ILP32F-FPELIM-NEXT: lw a4, 4(a0) ; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a3 ; ILP32-ILP32F-FPELIM-NEXT: sltu a1, a0, a1 -; ILP32-ILP32F-FPELIM-NEXT: add a1, a4, a1 +; ILP32-ILP32F-FPELIM-NEXT: add a2, a2, a4 ; ILP32-ILP32F-FPELIM-NEXT: add a1, a2, a1 ; ILP32-ILP32F-FPELIM-NEXT: addi sp, sp, 32 ; ILP32-ILP32F-FPELIM-NEXT: ret @@ -977,7 +977,7 @@ ; ILP32-ILP32F-WITHFP-NEXT: lw a4, 4(a0) ; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a3 ; ILP32-ILP32F-WITHFP-NEXT: sltu a1, a0, a1 -; ILP32-ILP32F-WITHFP-NEXT: add a1, a4, a1 +; ILP32-ILP32F-WITHFP-NEXT: add a2, a2, a4 ; ILP32-ILP32F-WITHFP-NEXT: add a1, a2, a1 ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 20(sp) # 4-byte Folded Reload ; ILP32-ILP32F-WITHFP-NEXT: lw s0, 16(sp) # 4-byte Folded Reload @@ -998,11 +998,11 @@ ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 20(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: fld ft0, 0(a0) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: fsd ft0, 8(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 8(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 12(sp) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 12(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a3, 8(sp) +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, a2, a0 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a3 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sltu a1, a0, a1 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a3, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a2, a1 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi sp, sp, 48 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: ret @@ -1164,8 +1164,8 @@ ; ILP32-ILP32F-FPELIM-NEXT: addi a3, a0, 4 ; ILP32-ILP32F-FPELIM-NEXT: sw a3, 4(sp) ; ILP32-ILP32F-FPELIM-NEXT: lw a0, 0(a0) -; ILP32-ILP32F-FPELIM-NEXT: add a2, s0, a2 -; ILP32-ILP32F-FPELIM-NEXT: add a0, a2, a0 +; ILP32-ILP32F-FPELIM-NEXT: add a1, a1, s0 +; ILP32-ILP32F-FPELIM-NEXT: add a1, a1, a2 ; ILP32-ILP32F-FPELIM-NEXT: add a0, a1, a0 ; ILP32-ILP32F-FPELIM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; ILP32-ILP32F-FPELIM-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1207,8 +1207,8 @@ ; ILP32-ILP32F-WITHFP-NEXT: addi a3, a0, 4 ; ILP32-ILP32F-WITHFP-NEXT: sw a3, -16(s0) ; ILP32-ILP32F-WITHFP-NEXT: lw a0, 0(a0) -; ILP32-ILP32F-WITHFP-NEXT: add a2, s1, a2 -; ILP32-ILP32F-WITHFP-NEXT: add a0, a2, a0 +; ILP32-ILP32F-WITHFP-NEXT: add a1, a1, s1 +; 
ILP32-ILP32F-WITHFP-NEXT: add a1, a1, a2 ; ILP32-ILP32F-WITHFP-NEXT: add a0, a1, a0 ; ILP32-ILP32F-WITHFP-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; ILP32-ILP32F-WITHFP-NEXT: lw s0, 24(sp) # 4-byte Folded Reload @@ -1249,8 +1249,8 @@ ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a3, a0, 4 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a3, 4(sp) ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw a0, 0(a0) -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a2, s0, a2 -; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a2, a0 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a1, s0 +; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, a1, a2 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a0, a1, a0 ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -1290,8 +1290,8 @@ ; LP64-LP64F-LP64D-FPELIM-NEXT: addi a3, a0, 8 ; LP64-LP64F-LP64D-FPELIM-NEXT: sd a3, 8(sp) ; LP64-LP64F-LP64D-FPELIM-NEXT: ld a0, 0(a0) -; LP64-LP64F-LP64D-FPELIM-NEXT: add a2, s0, a2 -; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, a2, a0 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, a1, s0 +; LP64-LP64F-LP64D-FPELIM-NEXT: add a1, a1, a2 ; LP64-LP64F-LP64D-FPELIM-NEXT: addw a0, a1, a0 ; LP64-LP64F-LP64D-FPELIM-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-FPELIM-NEXT: ld s0, 16(sp) # 8-byte Folded Reload @@ -1333,8 +1333,8 @@ ; LP64-LP64F-LP64D-WITHFP-NEXT: addi a3, a0, 8 ; LP64-LP64F-LP64D-WITHFP-NEXT: sd a3, -32(s0) ; LP64-LP64F-LP64D-WITHFP-NEXT: ld a0, 0(a0) -; LP64-LP64F-LP64D-WITHFP-NEXT: add a2, s1, a2 -; LP64-LP64F-LP64D-WITHFP-NEXT: add a0, a2, a0 +; LP64-LP64F-LP64D-WITHFP-NEXT: add a1, a1, s1 +; LP64-LP64F-LP64D-WITHFP-NEXT: add a1, a1, a2 ; LP64-LP64F-LP64D-WITHFP-NEXT: addw a0, a1, a0 ; LP64-LP64F-LP64D-WITHFP-NEXT: ld ra, 40(sp) # 8-byte Folded Reload ; LP64-LP64F-LP64D-WITHFP-NEXT: ld s0, 32(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -14,7 +14,7 @@ ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: srlw a0, a0, a1 @@ -37,7 +37,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) @@ -47,7 +47,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: srl a0, a0, a1 @@ -78,7 +78,7 @@ ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: sllw a0, a0, a1 @@ -101,7 +101,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) @@ -111,7 +111,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: 
slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sll a0, a0, a1 @@ -142,7 +142,7 @@ ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: sraw a0, a0, a1 @@ -165,7 +165,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) @@ -175,7 +175,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: sra a0, a0, a1 @@ -201,25 +201,25 @@ ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a6, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: lbu a3, 5(a1) ; RV64I-NEXT: lbu a4, 4(a1) ; RV64I-NEXT: lbu a5, 6(a1) @@ -228,8 +228,8 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 1(a1) ; RV64I-NEXT: lbu a5, 0(a1) ; RV64I-NEXT: lbu a6, 2(a1) @@ -238,7 +238,7 @@ ; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a3, a3, 35 @@ -271,8 +271,8 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 0(a1) ; RV32I-NEXT: lbu a6, 2(a1) @@ -281,7 +281,7 @@ ; RV32I-NEXT: or a4, a4, a5 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a5, a1, 3 ; RV32I-NEXT: addi a4, a5, -32 @@ -299,7 +299,7 @@ ; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: or a0, a0, t0 ; RV32I-NEXT: or a0, a0, a6 ; RV32I-NEXT: srl a0, a0, a5 ; RV32I-NEXT: slli a3, a3, 1 @@ -338,25 +338,25 @@ ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; 
RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a6, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: lbu a3, 5(a1) ; RV64I-NEXT: lbu a4, 4(a1) ; RV64I-NEXT: lbu a5, 6(a1) @@ -365,8 +365,8 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 1(a1) ; RV64I-NEXT: lbu a5, 0(a1) ; RV64I-NEXT: lbu a6, 2(a1) @@ -375,7 +375,7 @@ ; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a3, a3, 35 @@ -408,8 +408,8 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 0(a1) ; RV32I-NEXT: lbu a6, 2(a1) @@ -418,7 +418,7 @@ ; RV32I-NEXT: or a4, a4, a5 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a4, a6, a4 +; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: or a1, a1, a4 ; RV32I-NEXT: slli a5, a1, 3 ; RV32I-NEXT: addi a4, a5, -32 @@ -436,7 +436,7 @@ ; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: or a0, a0, t0 ; RV32I-NEXT: or a0, a0, a6 ; RV32I-NEXT: sll a0, a0, a5 ; RV32I-NEXT: srli a3, a3, 1 @@ -475,25 +475,25 @@ ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a6, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: lbu a3, 5(a1) ; RV64I-NEXT: lbu a4, 4(a1) ; RV64I-NEXT: lbu a5, 6(a1) @@ -502,8 +502,8 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: or a3, a6, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 1(a1) ; RV64I-NEXT: lbu a5, 0(a1) ; RV64I-NEXT: lbu a6, 2(a1) @@ -512,7 +512,7 @@ ; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a4, a6, a4 +; RV64I-NEXT: or a1, a1, a6 ; 
RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a3, a3, 35 @@ -540,46 +540,46 @@ ; RV32I-NEXT: lbu a3, 5(a0) ; RV32I-NEXT: lbu a4, 4(a0) ; RV32I-NEXT: lbu a5, 6(a0) +; RV32I-NEXT: lbu a6, 7(a0) ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: lbu a4, 1(a1) -; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: slli a4, a6, 24 +; RV32I-NEXT: or a5, a4, a5 ; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: lbu a5, 7(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a6 -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: or a1, a1, a4 -; RV32I-NEXT: slli a4, a1, 3 -; RV32I-NEXT: addi a6, a4, -32 -; RV32I-NEXT: sra a1, a3, a4 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a1, a1, a5 +; RV32I-NEXT: slli a5, a1, 3 +; RV32I-NEXT: addi a6, a5, -32 +; RV32I-NEXT: sra a1, a3, a5 ; RV32I-NEXT: bltz a6, .LBB5_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a5, a5, 31 +; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: j .LBB5_3 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: lbu a5, 1(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a6, 0(a0) ; RV32I-NEXT: lbu a7, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: srl a0, a0, a4 +; RV32I-NEXT: or a0, a0, a7 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srl a0, a0, a5 ; RV32I-NEXT: slli a3, a3, 1 -; RV32I-NEXT: not a4, a4 +; RV32I-NEXT: not a4, a5 ; RV32I-NEXT: sll a3, a3, a4 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: .LBB5_3: @@ -612,25 +612,25 @@ ; RV64I-NEXT: lbu a3, 9(a0) ; RV64I-NEXT: lbu a4, 8(a0) ; RV64I-NEXT: lbu a5, 10(a0) +; RV64I-NEXT: lbu a6, 11(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 13(a0) -; RV64I-NEXT: lbu a6, 11(a0) -; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 12(a0) +; RV64I-NEXT: lbu a6, 14(a0) +; RV64I-NEXT: lbu a7, 15(a0) ; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: lbu a7, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a3, a3, a6 ; RV64I-NEXT: lbu a4, 5(a1) ; RV64I-NEXT: lbu a5, 4(a1) ; RV64I-NEXT: lbu a6, 6(a1) @@ -639,8 +639,8 @@ ; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a6, a4 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: lbu a5, 1(a1) ; RV64I-NEXT: lbu a6, 0(a1) ; RV64I-NEXT: lbu a7, 2(a1) @@ -649,7 +649,7 @@ ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a1, a1, 24 
-; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a1, a1, a7 ; RV64I-NEXT: or a1, a1, a5 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a4, a4, 35 @@ -664,25 +664,25 @@ ; RV64I-NEXT: lbu a6, 1(a0) ; RV64I-NEXT: lbu a7, 0(a0) ; RV64I-NEXT: lbu t0, 2(a0) +; RV64I-NEXT: lbu t1, 3(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a7, 5(a0) -; RV64I-NEXT: lbu t1, 3(a0) -; RV64I-NEXT: or a6, t0, a6 ; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: lbu t2, 6(a0) +; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: srl a0, a0, a5 ; RV64I-NEXT: not a5, a5 ; RV64I-NEXT: slli a3, a3, 1 @@ -832,25 +832,25 @@ ; RV64I-NEXT: lbu a3, 1(a0) ; RV64I-NEXT: lbu a4, 0(a0) ; RV64I-NEXT: lbu a5, 2(a0) +; RV64I-NEXT: lbu a6, 3(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 4(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) ; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu t0, 7(a0) ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a3, a3, a6 ; RV64I-NEXT: lbu a4, 5(a1) ; RV64I-NEXT: lbu a5, 4(a1) ; RV64I-NEXT: lbu a6, 6(a1) @@ -859,8 +859,8 @@ ; RV64I-NEXT: or a4, a4, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a6, a4 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: lbu a5, 1(a1) ; RV64I-NEXT: lbu a6, 0(a1) ; RV64I-NEXT: lbu a7, 2(a1) @@ -869,7 +869,7 @@ ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a5, a7, a5 +; RV64I-NEXT: or a1, a1, a7 ; RV64I-NEXT: or a1, a1, a5 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a4, a4, 35 @@ -884,25 +884,25 @@ ; RV64I-NEXT: lbu a6, 9(a0) ; RV64I-NEXT: lbu a7, 8(a0) ; RV64I-NEXT: lbu t0, 10(a0) +; RV64I-NEXT: lbu t1, 11(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: lbu t1, 11(a0) -; RV64I-NEXT: or a6, t0, a6 ; RV64I-NEXT: lbu t0, 12(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: lbu t2, 14(a0) +; RV64I-NEXT: lbu t1, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: 
slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: sll a0, a0, a5 ; RV64I-NEXT: not a5, a5 ; RV64I-NEXT: srli a3, a3, 1 @@ -1057,20 +1057,20 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 13(a0) ; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a7, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: lbu a6, 14(a0) +; RV64I-NEXT: lbu a7, 15(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a5, a4, 32 ; RV64I-NEXT: or a3, a5, a3 -; RV64I-NEXT: or a3, a3, a6 ; RV64I-NEXT: lbu a5, 5(a1) ; RV64I-NEXT: lbu a6, 4(a1) ; RV64I-NEXT: lbu a7, 6(a1) @@ -1079,8 +1079,8 @@ ; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a5, a7, a5 -; RV64I-NEXT: or a5, t0, a5 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: lbu a6, 1(a1) ; RV64I-NEXT: lbu a7, 0(a1) ; RV64I-NEXT: lbu t0, 2(a1) @@ -1089,7 +1089,7 @@ ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: slli a1, a1, 3 ; RV64I-NEXT: slli a5, a5, 35 @@ -1106,25 +1106,25 @@ ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a6, 0(a0) ; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a6 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a4, a6, a4 ; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: or a4, a7, a4 ; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t0, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a6, t1, a6 +; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: srl a0, a0, a5 ; RV64I-NEXT: not a4, a5 ; RV64I-NEXT: slli a3, a3, 1 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -14,7 +14,7 @@ ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: srlw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) @@ -36,7 +36,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) @@ -46,7 +46,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: srl a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) @@ 
-75,7 +75,7 @@ ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: sllw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) @@ -97,7 +97,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) @@ -107,7 +107,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: sll a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) @@ -136,7 +136,7 @@ ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: lbu a1, 0(a1) ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: sraw a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) @@ -158,7 +158,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a0, a0, a5 ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: lbu a3, 1(a1) ; RV32I-NEXT: lbu a4, 0(a1) @@ -168,7 +168,7 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: sra a0, a0, a1 ; RV32I-NEXT: sb a0, 0(a2) @@ -197,42 +197,42 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a0) ; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a6, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: lbu a3, 1(a1) ; RV64I-NEXT: lbu a4, 0(a1) ; RV64I-NEXT: lbu a5, 2(a1) -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 3(a1) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a1) -; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: lbu a6, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: srl a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: srli a1, a0, 48 @@ -261,8 +261,8 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 0(a1) ; RV32I-NEXT: lbu a6, 2(a1) @@ -271,8 +271,8 @@ ; RV32I-NEXT: or a4, a4, a5 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a5, 
a6, a4 -; RV32I-NEXT: or a5, a1, a5 +; RV32I-NEXT: or a5, a1, a6 +; RV32I-NEXT: or a5, a5, a4 ; RV32I-NEXT: addi a4, a5, -32 ; RV32I-NEXT: srl a1, a3, a5 ; RV32I-NEXT: bltz a4, .LBB3_2 @@ -288,7 +288,7 @@ ; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: or a0, a0, t0 ; RV32I-NEXT: or a0, a0, a6 ; RV32I-NEXT: srl a0, a0, a5 ; RV32I-NEXT: not a5, a5 @@ -331,42 +331,42 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a0) ; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a6, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a3 ; RV64I-NEXT: lbu a3, 1(a1) ; RV64I-NEXT: lbu a4, 0(a1) ; RV64I-NEXT: lbu a5, 2(a1) -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 3(a1) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a1) -; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: lbu a6, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: sll a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: srli a1, a0, 48 @@ -395,8 +395,8 @@ ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: or a3, a6, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a3, a4, a3 ; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu a5, 0(a1) ; RV32I-NEXT: lbu a6, 2(a1) @@ -405,8 +405,8 @@ ; RV32I-NEXT: or a4, a4, a5 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a5, a6, a4 -; RV32I-NEXT: or a5, a1, a5 +; RV32I-NEXT: or a5, a1, a6 +; RV32I-NEXT: or a5, a5, a4 ; RV32I-NEXT: addi a4, a5, -32 ; RV32I-NEXT: sll a1, a3, a5 ; RV32I-NEXT: bltz a4, .LBB4_2 @@ -422,7 +422,7 @@ ; RV32I-NEXT: or a6, a6, a7 ; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a6, t0, a6 +; RV32I-NEXT: or a0, a0, t0 ; RV32I-NEXT: or a0, a0, a6 ; RV32I-NEXT: sll a0, a0, a5 ; RV32I-NEXT: not a5, a5 @@ -465,42 +465,42 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a0) ; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu a6, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: or a0, a0, a4 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, 
a3 ; RV64I-NEXT: lbu a3, 1(a1) ; RV64I-NEXT: lbu a4, 0(a1) ; RV64I-NEXT: lbu a5, 2(a1) -; RV64I-NEXT: or a0, a0, a6 +; RV64I-NEXT: lbu a6, 3(a1) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a1) -; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 4(a1) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: lbu a7, 6(a1) +; RV64I-NEXT: lbu a6, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a1, a1, 32 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: sra a0, a0, a1 ; RV64I-NEXT: sb a0, 0(a2) ; RV64I-NEXT: srli a1, a0, 48 @@ -524,44 +524,44 @@ ; RV32I-NEXT: lbu a3, 5(a0) ; RV32I-NEXT: lbu a4, 4(a0) ; RV32I-NEXT: lbu a5, 6(a0) +; RV32I-NEXT: lbu a6, 7(a0) ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: or a3, a3, a4 ; RV32I-NEXT: slli a5, a5, 16 -; RV32I-NEXT: lbu a4, 1(a1) -; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: slli a4, a6, 24 +; RV32I-NEXT: or a5, a4, a5 ; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: lbu a5, 7(a0) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, a6 -; RV32I-NEXT: lbu a6, 2(a1) +; RV32I-NEXT: lbu a5, 1(a1) +; RV32I-NEXT: lbu a6, 0(a1) +; RV32I-NEXT: lbu a7, 2(a1) ; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: slli a6, a6, 16 +; RV32I-NEXT: slli a5, a5, 8 +; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: or a4, a1, a4 -; RV32I-NEXT: addi a6, a4, -32 -; RV32I-NEXT: sra a1, a3, a4 +; RV32I-NEXT: or a1, a1, a7 +; RV32I-NEXT: or a5, a1, a5 +; RV32I-NEXT: addi a6, a5, -32 +; RV32I-NEXT: sra a1, a3, a5 ; RV32I-NEXT: bltz a6, .LBB5_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a5, a5, 31 +; RV32I-NEXT: srai a4, a4, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a5 +; RV32I-NEXT: mv a1, a4 ; RV32I-NEXT: j .LBB5_3 ; RV32I-NEXT: .LBB5_2: -; RV32I-NEXT: lbu a5, 1(a0) +; RV32I-NEXT: lbu a4, 1(a0) ; RV32I-NEXT: lbu a6, 0(a0) ; RV32I-NEXT: lbu a7, 2(a0) ; RV32I-NEXT: lbu a0, 3(a0) -; RV32I-NEXT: slli a5, a5, 8 -; RV32I-NEXT: or a5, a5, a6 +; RV32I-NEXT: slli a4, a4, 8 +; RV32I-NEXT: or a4, a4, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or a5, a7, a5 -; RV32I-NEXT: or a0, a0, a5 -; RV32I-NEXT: srl a0, a0, a4 -; RV32I-NEXT: not a4, a4 +; RV32I-NEXT: or a0, a0, a7 +; RV32I-NEXT: or a0, a0, a4 +; RV32I-NEXT: srl a0, a0, a5 +; RV32I-NEXT: not a4, a5 ; RV32I-NEXT: slli a3, a3, 1 ; RV32I-NEXT: sll a3, a3, a4 ; RV32I-NEXT: or a0, a0, a3 @@ -599,42 +599,42 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 13(a0) ; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a7, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: lbu a6, 14(a0) +; RV64I-NEXT: lbu a7, 15(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, 
a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 1(a1) ; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: or a3, a3, a6 +; RV64I-NEXT: lbu a6, 2(a1) +; RV64I-NEXT: lbu a7, 3(a1) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: lbu a5, 5(a1) -; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or a5, a5, a7 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a5, t0, a5 +; RV64I-NEXT: or a1, a1, a7 ; RV64I-NEXT: or a1, a1, a5 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: or a5, a1, a6 +; RV64I-NEXT: or a5, a1, a4 ; RV64I-NEXT: addi a4, a5, -64 ; RV64I-NEXT: srl a1, a3, a5 ; RV64I-NEXT: bltz a4, .LBB6_2 @@ -645,25 +645,25 @@ ; RV64I-NEXT: lbu a6, 1(a0) ; RV64I-NEXT: lbu a7, 0(a0) ; RV64I-NEXT: lbu t0, 2(a0) +; RV64I-NEXT: lbu t1, 3(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a7, 5(a0) -; RV64I-NEXT: lbu t1, 3(a0) -; RV64I-NEXT: or a6, t0, a6 ; RV64I-NEXT: lbu t0, 4(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: lbu t2, 6(a0) +; RV64I-NEXT: lbu t1, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: srl a0, a0, a5 ; RV64I-NEXT: not a5, a5 ; RV64I-NEXT: slli a3, a3, 1 @@ -737,7 +737,7 @@ ; RV32I-NEXT: lbu a0, 15(a0) ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or a1, a1, s1 ; RV32I-NEXT: or a1, a1, s0 ; RV32I-NEXT: sb zero, 43(sp) ; RV32I-NEXT: sb zero, 42(sp) @@ -783,8 +783,8 @@ ; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a0, a5, a0 -; RV32I-NEXT: or a4, a6, a0 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a4, a4, a0 ; RV32I-NEXT: andi a5, a1, 7 ; RV32I-NEXT: srl a0, a4, a5 ; RV32I-NEXT: lbu a1, 9(a3) @@ -795,8 +795,8 @@ ; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or a6, t0, a1 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: slli a1, a6, 1 ; RV32I-NEXT: not a7, a5 ; RV32I-NEXT: sll a1, a1, a7 @@ -809,8 +809,8 @@ ; RV32I-NEXT: or a7, a7, t0 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t1, a7 -; RV32I-NEXT: or a7, t2, a7 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 ; RV32I-NEXT: srl a7, a7, a5 ; RV32I-NEXT: slli a4, a4, 1 ; RV32I-NEXT: xori t0, a5, 31 @@ -825,7 +825,7 @@ ; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: or 
a3, a3, t3 ; RV32I-NEXT: or a3, a3, t1 ; RV32I-NEXT: slli t1, a3, 1 ; RV32I-NEXT: sll t0, t1, t0 @@ -883,42 +883,42 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 5(a0) ; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu t0, 7(a0) +; RV64I-NEXT: lbu a6, 6(a0) +; RV64I-NEXT: lbu a7, 7(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 1(a1) ; RV64I-NEXT: lbu a5, 0(a1) -; RV64I-NEXT: lbu a7, 2(a1) -; RV64I-NEXT: or a3, a3, a6 +; RV64I-NEXT: lbu a6, 2(a1) +; RV64I-NEXT: lbu a7, 3(a1) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: lbu a5, 5(a1) -; RV64I-NEXT: lbu a6, 3(a1) -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: lbu a7, 4(a1) -; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: lbu t0, 6(a1) +; RV64I-NEXT: lbu a6, 4(a1) +; RV64I-NEXT: lbu a7, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or a5, a5, a7 -; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a5, t0, a5 +; RV64I-NEXT: or a1, a1, a7 ; RV64I-NEXT: or a1, a1, a5 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: or a5, a1, a6 +; RV64I-NEXT: or a5, a1, a4 ; RV64I-NEXT: addi a4, a5, -64 ; RV64I-NEXT: sll a1, a3, a5 ; RV64I-NEXT: bltz a4, .LBB7_2 @@ -929,25 +929,25 @@ ; RV64I-NEXT: lbu a6, 9(a0) ; RV64I-NEXT: lbu a7, 8(a0) ; RV64I-NEXT: lbu t0, 10(a0) +; RV64I-NEXT: lbu t1, 11(a0) ; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a7, 13(a0) -; RV64I-NEXT: lbu t1, 11(a0) -; RV64I-NEXT: or a6, t0, a6 ; RV64I-NEXT: lbu t0, 12(a0) -; RV64I-NEXT: slli a7, a7, 8 -; RV64I-NEXT: lbu t2, 14(a0) +; RV64I-NEXT: lbu t1, 14(a0) ; RV64I-NEXT: lbu a0, 15(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a7, t2, a7 +; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: or a0, a0, a7 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a6 -; RV64I-NEXT: or a0, a0, t1 ; RV64I-NEXT: sll a0, a0, a5 ; RV64I-NEXT: not a5, a5 ; RV64I-NEXT: srli a3, a3, 1 @@ -1021,7 +1021,7 @@ ; RV32I-NEXT: lbu a0, 15(a0) ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: or a1, a1, s1 ; RV32I-NEXT: or a1, a1, s0 ; RV32I-NEXT: sb zero, 27(sp) ; RV32I-NEXT: sb zero, 26(sp) @@ -1067,8 +1067,8 @@ ; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a0, a5, a0 -; RV32I-NEXT: or a4, a6, a0 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a4, a4, a0 ; RV32I-NEXT: andi a5, a1, 7 ; RV32I-NEXT: sll a0, a4, a5 ; RV32I-NEXT: lbu a1, 1(a3) @@ -1079,8 +1079,8 @@ ; 
RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or a6, t0, a1 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: srli a1, a6, 1 ; RV32I-NEXT: xori a7, a5, 31 ; RV32I-NEXT: srl a1, a1, a7 @@ -1093,8 +1093,8 @@ ; RV32I-NEXT: or t0, t0, t1 ; RV32I-NEXT: slli t2, t2, 16 ; RV32I-NEXT: slli t3, t3, 24 -; RV32I-NEXT: or t0, t2, t0 -; RV32I-NEXT: or t0, t3, t0 +; RV32I-NEXT: or t1, t3, t2 +; RV32I-NEXT: or t0, t1, t0 ; RV32I-NEXT: sll t0, t0, a5 ; RV32I-NEXT: lbu t1, 9(a3) ; RV32I-NEXT: lbu t2, 8(a3) @@ -1104,7 +1104,7 @@ ; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: or a3, a3, t3 ; RV32I-NEXT: or a3, a3, t1 ; RV32I-NEXT: srli t1, a3, 1 ; RV32I-NEXT: srl a7, t1, a7 @@ -1167,42 +1167,42 @@ ; RV64I-NEXT: or a3, a3, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: lbu a4, 13(a0) ; RV64I-NEXT: lbu a5, 12(a0) -; RV64I-NEXT: lbu a7, 14(a0) -; RV64I-NEXT: lbu t0, 15(a0) +; RV64I-NEXT: lbu a6, 14(a0) +; RV64I-NEXT: lbu a7, 15(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 -; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, t0, a4 +; RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: slli a5, a4, 32 ; RV64I-NEXT: or a3, a5, a3 ; RV64I-NEXT: lbu a5, 1(a1) -; RV64I-NEXT: lbu a7, 0(a1) -; RV64I-NEXT: lbu t0, 2(a1) -; RV64I-NEXT: or a3, a3, a6 +; RV64I-NEXT: lbu a6, 0(a1) +; RV64I-NEXT: lbu a7, 2(a1) +; RV64I-NEXT: lbu t0, 3(a1) ; RV64I-NEXT: slli a5, a5, 8 -; RV64I-NEXT: or a5, a5, a7 -; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: or a5, a5, a6 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: lbu a6, 5(a1) -; RV64I-NEXT: lbu a7, 3(a1) -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: lbu t0, 4(a1) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: lbu t1, 6(a1) +; RV64I-NEXT: lbu a7, 4(a1) +; RV64I-NEXT: lbu t0, 6(a1) ; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: or a6, a6, t0 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a6, t1, a6 +; RV64I-NEXT: or a1, a1, t0 ; RV64I-NEXT: or a1, a1, a6 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a5 -; RV64I-NEXT: or a5, a1, a7 +; RV64I-NEXT: or a5, a1, a5 ; RV64I-NEXT: addi a6, a5, -64 ; RV64I-NEXT: sra a1, a3, a5 ; RV64I-NEXT: bltz a6, .LBB8_2 @@ -1215,25 +1215,25 @@ ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a6, 0(a0) ; RV64I-NEXT: lbu a7, 2(a0) +; RV64I-NEXT: lbu t0, 3(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a6 ; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a4, a6, a4 ; RV64I-NEXT: lbu a6, 5(a0) -; RV64I-NEXT: lbu t0, 3(a0) -; RV64I-NEXT: or a4, a7, a4 ; RV64I-NEXT: lbu a7, 4(a0) -; RV64I-NEXT: slli a6, a6, 8 -; RV64I-NEXT: lbu t1, 6(a0) +; RV64I-NEXT: lbu t0, 6(a0) ; RV64I-NEXT: lbu a0, 7(a0) +; RV64I-NEXT: slli a6, a6, 8 ; RV64I-NEXT: or a6, a6, a7 -; RV64I-NEXT: slli t0, t0, 24 -; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: or a6, t1, a6 +; 
RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: or a0, a0, a6 ; RV64I-NEXT: slli a0, a0, 32 ; RV64I-NEXT: or a0, a0, a4 -; RV64I-NEXT: or a0, a0, t0 ; RV64I-NEXT: srl a0, a0, a5 ; RV64I-NEXT: not a4, a5 ; RV64I-NEXT: slli a3, a3, 1 @@ -1306,7 +1306,7 @@ ; RV32I-NEXT: lbu a0, 14(a0) ; RV32I-NEXT: slli s2, s2, 16 ; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or s1, s2, s1 +; RV32I-NEXT: or a1, a1, s2 ; RV32I-NEXT: or a1, a1, s1 ; RV32I-NEXT: sb a3, 23(sp) ; RV32I-NEXT: sb a0, 22(sp) @@ -1356,8 +1356,8 @@ ; RV32I-NEXT: or a0, a0, a4 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a0, a5, a0 -; RV32I-NEXT: or a4, a6, a0 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a4, a4, a0 ; RV32I-NEXT: andi a5, a1, 7 ; RV32I-NEXT: srl a0, a4, a5 ; RV32I-NEXT: lbu a1, 9(a3) @@ -1368,8 +1368,8 @@ ; RV32I-NEXT: or a1, a1, a6 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t0, t0, 24 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or a6, t0, a1 +; RV32I-NEXT: or a6, t0, a7 +; RV32I-NEXT: or a6, a6, a1 ; RV32I-NEXT: slli a1, a6, 1 ; RV32I-NEXT: not a7, a5 ; RV32I-NEXT: sll a1, a1, a7 @@ -1382,8 +1382,8 @@ ; RV32I-NEXT: or a7, a7, t0 ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a7, t1, a7 -; RV32I-NEXT: or a7, t2, a7 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or a7, t0, a7 ; RV32I-NEXT: srl a7, a7, a5 ; RV32I-NEXT: slli a4, a4, 1 ; RV32I-NEXT: xori t0, a5, 31 @@ -1398,7 +1398,7 @@ ; RV32I-NEXT: or t1, t1, t2 ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or t1, t3, t1 +; RV32I-NEXT: or a3, a3, t3 ; RV32I-NEXT: or a3, a3, t1 ; RV32I-NEXT: slli t1, a3, 1 ; RV32I-NEXT: sll t0, t1, t0 @@ -1490,32 +1490,32 @@ ; RV64I-NEXT: lbu s5, 17(a0) ; RV64I-NEXT: lbu s6, 18(a0) ; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: lbu s8, 20(a0) ; RV64I-NEXT: lbu s9, 1(a1) ; RV64I-NEXT: lbu s10, 0(a1) ; RV64I-NEXT: lbu s11, 2(a1) -; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu ra, 3(a1) ; RV64I-NEXT: slli s9, s9, 8 ; RV64I-NEXT: or s9, s9, s10 ; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) -; RV64I-NEXT: lbu ra, 4(a1) +; RV64I-NEXT: or s11, ra, s11 ; RV64I-NEXT: or s9, s11, s9 -; RV64I-NEXT: lbu s11, 6(a1) +; RV64I-NEXT: lbu s11, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 -; RV64I-NEXT: or s10, s10, ra -; RV64I-NEXT: lbu ra, 7(a1) -; RV64I-NEXT: slli s11, s11, 16 -; RV64I-NEXT: or s10, s11, s10 +; RV64I-NEXT: lbu ra, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: or s10, s10, s11 ; RV64I-NEXT: lbu s11, 21(a0) -; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: or s10, ra, s10 +; RV64I-NEXT: slli ra, ra, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, ra ; RV64I-NEXT: lbu ra, 22(a0) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli s10, s10, 32 -; RV64I-NEXT: or s9, s10, s9 +; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 23(a0) -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or t0, s9, a1 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or t0, a1, s9 ; RV64I-NEXT: lbu s9, 24(a0) ; RV64I-NEXT: lbu a7, 25(a0) ; RV64I-NEXT: lbu a6, 26(a0) @@ -1546,6 +1546,7 @@ ; RV64I-NEXT: sb s0, 68(sp) ; RV64I-NEXT: sb t6, 67(sp) ; RV64I-NEXT: sb t5, 66(sp) +; RV64I-NEXT: sb t4, 65(sp) ; RV64I-NEXT: sb zero, 119(sp) ; RV64I-NEXT: sb zero, 118(sp) ; RV64I-NEXT: sb zero, 117(sp) @@ -1578,7 +1579,6 @@ ; RV64I-NEXT: sb zero, 90(sp) ; RV64I-NEXT: sb zero, 89(sp) ; RV64I-NEXT: sb zero, 88(sp) -; RV64I-NEXT: sb t4, 65(sp) ; RV64I-NEXT: sb t3, 64(sp) ; RV64I-NEXT: sb t2, 63(sp) ; RV64I-NEXT: sb t1, 62(sp) @@ -1606,20 +1606,20 @@ ; 
RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: lbu a1, 13(a3) ; RV64I-NEXT: lbu a4, 12(a3) -; RV64I-NEXT: lbu a6, 14(a3) -; RV64I-NEXT: lbu a7, 15(a3) +; RV64I-NEXT: lbu a5, 14(a3) +; RV64I-NEXT: lbu a6, 15(a3) ; RV64I-NEXT: slli a1, a1, 8 ; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a1, a6, a1 -; RV64I-NEXT: or a1, a7, a1 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a1, a4, a1 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: or a4, a0, a5 +; RV64I-NEXT: or a4, a1, a0 ; RV64I-NEXT: andi a1, t0, 7 ; RV64I-NEXT: lbu a0, 17(a3) ; RV64I-NEXT: lbu a5, 16(a3) @@ -1629,20 +1629,20 @@ ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a0, a6, a0 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a0, a5, a0 ; RV64I-NEXT: lbu a5, 21(a3) ; RV64I-NEXT: lbu a6, 20(a3) -; RV64I-NEXT: lbu t0, 22(a3) -; RV64I-NEXT: lbu t1, 23(a3) +; RV64I-NEXT: lbu a7, 22(a3) +; RV64I-NEXT: lbu t0, 23(a3) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, t1, a5 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: or a5, a0, a7 +; RV64I-NEXT: or a5, a5, a0 ; RV64I-NEXT: slli a0, a5, 1 ; RV64I-NEXT: not a6, a1 ; RV64I-NEXT: sll a0, a0, a6 @@ -1654,45 +1654,45 @@ ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a7, 5(a3) ; RV64I-NEXT: lbu t0, 4(a3) -; RV64I-NEXT: lbu t2, 6(a3) -; RV64I-NEXT: lbu t3, 7(a3) +; RV64I-NEXT: lbu t1, 6(a3) +; RV64I-NEXT: lbu t2, 7(a3) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: or a7, t3, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a7, 25(a3) ; RV64I-NEXT: lbu t0, 24(a3) -; RV64I-NEXT: lbu t2, 26(a3) -; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: lbu t1, 26(a3) +; RV64I-NEXT: lbu t2, 27(a3) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: lbu t0, 29(a3) -; RV64I-NEXT: or a7, t2, a7 ; RV64I-NEXT: lbu t1, 28(a3) ; RV64I-NEXT: lbu t2, 30(a3) +; RV64I-NEXT: lbu a3, 31(a3) ; RV64I-NEXT: slli t0, t0, 8 -; RV64I-NEXT: lbu t3, 31(a3) ; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: or t0, t2, t0 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: or a3, a3, t2 ; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: lbu a3, 27(a3) -; RV64I-NEXT: slli t0, t0, 32 -; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or a3, a3, t0 ; RV64I-NEXT: xori t0, a1, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a3, a7, a3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: or a3, a3, a7 ; 
RV64I-NEXT: slli a7, a3, 1 ; RV64I-NEXT: sll a7, a7, t0 ; RV64I-NEXT: srl a4, a4, a1 @@ -1822,17 +1822,17 @@ ; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: lbu s8, 1(a1) ; RV32I-NEXT: lbu s9, 20(a0) -; RV32I-NEXT: lbu s10, 0(a1) -; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu s10, 21(a0) +; RV32I-NEXT: lbu s11, 0(a1) ; RV32I-NEXT: slli s8, s8, 8 ; RV32I-NEXT: lbu ra, 2(a1) -; RV32I-NEXT: or s8, s8, s10 -; RV32I-NEXT: lbu s10, 22(a0) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: or s8, s8, s11 +; RV32I-NEXT: lbu s11, 22(a0) ; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: or s8, ra, s8 -; RV32I-NEXT: lbu ra, 23(a0) ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: lbu ra, 23(a0) ; RV32I-NEXT: or t0, a1, s8 ; RV32I-NEXT: lbu s8, 24(a0) ; RV32I-NEXT: lbu a7, 25(a0) @@ -1851,8 +1851,8 @@ ; RV32I-NEXT: sb a7, 53(sp) ; RV32I-NEXT: sb s8, 52(sp) ; RV32I-NEXT: sb ra, 51(sp) -; RV32I-NEXT: sb s10, 50(sp) -; RV32I-NEXT: sb s11, 49(sp) +; RV32I-NEXT: sb s11, 50(sp) +; RV32I-NEXT: sb s10, 49(sp) ; RV32I-NEXT: sb s9, 48(sp) ; RV32I-NEXT: sb s7, 47(sp) ; RV32I-NEXT: sb s6, 46(sp) @@ -1924,8 +1924,8 @@ ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or t4, a5, a0 +; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or t4, a3, a0 ; RV32I-NEXT: andi a3, t0, 7 ; RV32I-NEXT: lbu a0, 9(a4) ; RV32I-NEXT: lbu a1, 8(a4) @@ -1935,8 +1935,8 @@ ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a0, a5, a0 -; RV32I-NEXT: or a6, a6, a0 +; RV32I-NEXT: or a1, a6, a5 +; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 ; RV32I-NEXT: not t0, a3 ; RV32I-NEXT: sll a0, a0, t0 @@ -1948,8 +1948,8 @@ ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or t1, t1, a1 +; RV32I-NEXT: or a5, t1, a7 +; RV32I-NEXT: or t1, a5, a1 ; RV32I-NEXT: slli a1, t4, 1 ; RV32I-NEXT: xori t2, a3, 31 ; RV32I-NEXT: sll a1, a1, t2 @@ -1961,8 +1961,8 @@ ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a5, t3, a5 -; RV32I-NEXT: or t3, t5, a5 +; RV32I-NEXT: or a7, t5, t3 +; RV32I-NEXT: or t3, a7, a5 ; RV32I-NEXT: lbu a5, 17(a4) ; RV32I-NEXT: lbu a7, 16(a4) ; RV32I-NEXT: lbu t5, 18(a4) @@ -1971,8 +1971,8 @@ ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t5, a5 -; RV32I-NEXT: or a5, t6, a5 +; RV32I-NEXT: or a7, t6, t5 +; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: slli a7, a5, 1 ; RV32I-NEXT: sll a7, a7, t0 ; RV32I-NEXT: lbu t5, 21(a4) @@ -1983,8 +1983,8 @@ ; RV32I-NEXT: or t5, t5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 ; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: or t5, s1, t5 ; RV32I-NEXT: lbu t6, 25(a4) ; RV32I-NEXT: lbu s0, 24(a4) ; RV32I-NEXT: lbu s1, 26(a4) @@ -1993,23 +1993,23 @@ ; RV32I-NEXT: or t6, t6, s0 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s1, t6 -; RV32I-NEXT: or t6, s2, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: lbu s0, 29(a4) -; RV32I-NEXT: slli s1, t6, 1 -; RV32I-NEXT: lbu s2, 28(a4) -; RV32I-NEXT: sll t0, s1, t0 +; RV32I-NEXT: lbu s1, 28(a4) +; RV32I-NEXT: slli s2, t6, 1 +; RV32I-NEXT: sll t0, s2, t0 ; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: or s0, s0, s1 ; RV32I-NEXT: lbu s1, 30(a4) -; RV32I-NEXT: or s0, s0, s2 +; RV32I-NEXT: lbu a4, 31(a4) ; RV32I-NEXT: slli s2, t3, 1 ; 
RV32I-NEXT: sll s2, s2, t2 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: lbu a4, 31(a4) -; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: slli a4, a4, 24 +; RV32I-NEXT: or a4, a4, s1 ; RV32I-NEXT: slli s1, t5, 1 ; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: slli a4, a4, 24 ; RV32I-NEXT: or a4, a4, s0 ; RV32I-NEXT: slli s0, a4, 1 ; RV32I-NEXT: sll t2, s0, t2 @@ -2148,32 +2148,32 @@ ; RV64I-NEXT: lbu s5, 17(a0) ; RV64I-NEXT: lbu s6, 18(a0) ; RV64I-NEXT: lbu s7, 19(a0) +; RV64I-NEXT: lbu s8, 20(a0) ; RV64I-NEXT: lbu s9, 1(a1) ; RV64I-NEXT: lbu s10, 0(a1) ; RV64I-NEXT: lbu s11, 2(a1) -; RV64I-NEXT: lbu s8, 20(a0) +; RV64I-NEXT: lbu ra, 3(a1) ; RV64I-NEXT: slli s9, s9, 8 ; RV64I-NEXT: or s9, s9, s10 ; RV64I-NEXT: slli s11, s11, 16 +; RV64I-NEXT: slli ra, ra, 24 ; RV64I-NEXT: lbu s10, 5(a1) -; RV64I-NEXT: lbu ra, 4(a1) +; RV64I-NEXT: or s11, ra, s11 ; RV64I-NEXT: or s9, s11, s9 -; RV64I-NEXT: lbu s11, 6(a1) +; RV64I-NEXT: lbu s11, 4(a1) ; RV64I-NEXT: slli s10, s10, 8 -; RV64I-NEXT: or s10, s10, ra -; RV64I-NEXT: lbu ra, 7(a1) -; RV64I-NEXT: slli s11, s11, 16 -; RV64I-NEXT: or s10, s11, s10 +; RV64I-NEXT: lbu ra, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: or s10, s10, s11 ; RV64I-NEXT: lbu s11, 21(a0) -; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: or s10, ra, s10 +; RV64I-NEXT: slli ra, ra, 16 +; RV64I-NEXT: slli a1, a1, 24 +; RV64I-NEXT: or a1, a1, ra ; RV64I-NEXT: lbu ra, 22(a0) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli s10, s10, 32 -; RV64I-NEXT: or s9, s10, s9 +; RV64I-NEXT: or a1, a1, s10 ; RV64I-NEXT: lbu s10, 23(a0) -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or t0, s9, a1 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or t0, a1, s9 ; RV64I-NEXT: lbu s9, 24(a0) ; RV64I-NEXT: lbu a7, 25(a0) ; RV64I-NEXT: lbu a6, 26(a0) @@ -2205,6 +2205,7 @@ ; RV64I-NEXT: sb t6, 99(sp) ; RV64I-NEXT: sb t5, 98(sp) ; RV64I-NEXT: sb t4, 97(sp) +; RV64I-NEXT: sb t3, 96(sp) ; RV64I-NEXT: sb zero, 87(sp) ; RV64I-NEXT: sb zero, 86(sp) ; RV64I-NEXT: sb zero, 85(sp) @@ -2237,7 +2238,6 @@ ; RV64I-NEXT: sb zero, 58(sp) ; RV64I-NEXT: sb zero, 57(sp) ; RV64I-NEXT: sb zero, 56(sp) -; RV64I-NEXT: sb t3, 96(sp) ; RV64I-NEXT: sb t2, 95(sp) ; RV64I-NEXT: sb t1, 94(sp) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload @@ -2264,161 +2264,161 @@ ; RV64I-NEXT: or a1, a1, a3 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: or a1, a4, a1 ; RV64I-NEXT: lbu a3, 13(a0) ; RV64I-NEXT: lbu a4, 12(a0) -; RV64I-NEXT: lbu a6, 14(a0) -; RV64I-NEXT: lbu a7, 15(a0) +; RV64I-NEXT: lbu a5, 14(a0) +; RV64I-NEXT: lbu a6, 15(a0) ; RV64I-NEXT: slli a3, a3, 8 ; RV64I-NEXT: or a3, a3, a4 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: or a3, a7, a3 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: or a1, a3, a1 -; RV64I-NEXT: or a3, a1, a5 -; RV64I-NEXT: lbu a1, 1(a0) -; RV64I-NEXT: lbu a4, 0(a0) -; RV64I-NEXT: lbu a5, 2(a0) -; RV64I-NEXT: lbu a6, 3(a0) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: or a1, a1, a4 ; RV64I-NEXT: slli a5, a5, 16 ; RV64I-NEXT: slli a6, a6, 24 -; RV64I-NEXT: or a1, a5, a1 -; RV64I-NEXT: lbu a4, 5(a0) -; RV64I-NEXT: lbu a5, 4(a0) -; RV64I-NEXT: lbu a7, 6(a0) -; RV64I-NEXT: lbu t1, 7(a0) +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: or a3, a3, a1 +; RV64I-NEXT: andi a1, t0, 7 +; RV64I-NEXT: lbu a4, 1(a0) +; RV64I-NEXT: lbu a5, 0(a0) +; RV64I-NEXT: lbu a6, 2(a0) +; RV64I-NEXT: lbu a7, 3(a0) ; RV64I-NEXT: slli a4, a4, 8 ; RV64I-NEXT: or a4, a4, a5 +; 
RV64I-NEXT: slli a6, a6, 16 +; RV64I-NEXT: slli a7, a7, 24 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 5(a0) +; RV64I-NEXT: lbu a6, 4(a0) +; RV64I-NEXT: lbu a7, 6(a0) +; RV64I-NEXT: lbu t0, 7(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a4, t1, a4 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: lbu a4, 25(a0) -; RV64I-NEXT: lbu a5, 24(a0) +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: slli a5, a5, 32 +; RV64I-NEXT: or a4, a5, a4 +; RV64I-NEXT: lbu a5, 25(a0) +; RV64I-NEXT: lbu a6, 24(a0) ; RV64I-NEXT: lbu a7, 26(a0) -; RV64I-NEXT: or a6, a1, a6 -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a5 +; RV64I-NEXT: lbu t0, 27(a0) +; RV64I-NEXT: slli a5, a5, 8 +; RV64I-NEXT: or a5, a5, a6 ; RV64I-NEXT: slli a7, a7, 16 -; RV64I-NEXT: lbu a1, 29(a0) -; RV64I-NEXT: lbu a5, 27(a0) -; RV64I-NEXT: or a4, a7, a4 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 29(a0) ; RV64I-NEXT: lbu a7, 28(a0) -; RV64I-NEXT: slli a1, a1, 8 -; RV64I-NEXT: lbu t1, 30(a0) -; RV64I-NEXT: lbu t2, 31(a0) -; RV64I-NEXT: or a1, a1, a7 -; RV64I-NEXT: slli a5, a5, 24 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a1, t1, a1 -; RV64I-NEXT: or a1, t2, a1 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: lbu a4, 17(a0) +; RV64I-NEXT: lbu t0, 30(a0) +; RV64I-NEXT: lbu t1, 31(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: or a5, a6, a5 +; RV64I-NEXT: lbu a6, 17(a0) ; RV64I-NEXT: lbu a7, 16(a0) -; RV64I-NEXT: lbu t1, 18(a0) -; RV64I-NEXT: or a5, a1, a5 -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a1, a4, a7 -; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: or a1, t1, a1 -; RV64I-NEXT: lbu a4, 21(a0) -; RV64I-NEXT: lbu a7, 20(a0) +; RV64I-NEXT: lbu t0, 18(a0) +; RV64I-NEXT: lbu t1, 19(a0) +; RV64I-NEXT: slli a6, a6, 8 +; RV64I-NEXT: or a6, a6, a7 +; RV64I-NEXT: slli t0, t0, 16 +; RV64I-NEXT: slli t1, t1, 24 +; RV64I-NEXT: lbu a7, 21(a0) +; RV64I-NEXT: or t0, t1, t0 +; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: lbu t0, 20(a0) +; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: lbu t1, 22(a0) -; RV64I-NEXT: andi t0, t0, 7 -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: or a4, a4, a7 +; RV64I-NEXT: lbu a0, 23(a0) +; RV64I-NEXT: or a7, a7, t0 +; RV64I-NEXT: srli t0, a4, 1 ; RV64I-NEXT: slli t1, t1, 16 -; RV64I-NEXT: lbu a7, 23(a0) -; RV64I-NEXT: or a4, t1, a4 -; RV64I-NEXT: srli t1, a6, 1 -; RV64I-NEXT: lbu t2, 19(a0) -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: xori a7, t0, 63 -; RV64I-NEXT: srl a0, t1, a7 -; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: slli a4, a4, 32 -; RV64I-NEXT: or a1, a4, a1 -; RV64I-NEXT: or a4, a1, t2 -; RV64I-NEXT: srli a1, a4, 1 -; RV64I-NEXT: srl a7, a1, a7 -; RV64I-NEXT: srli a1, a3, 1 -; RV64I-NEXT: not t1, t0 -; RV64I-NEXT: srl t1, a1, t1 -; RV64I-NEXT: sll a1, a3, t0 -; RV64I-NEXT: sll a3, a5, t0 -; RV64I-NEXT: sll a4, a4, t0 -; RV64I-NEXT: sll a5, a6, t0 -; RV64I-NEXT: srli a6, a4, 56 -; RV64I-NEXT: sb a6, 23(a2) -; RV64I-NEXT: srli a6, a4, 48 -; RV64I-NEXT: sb a6, 22(a2) -; RV64I-NEXT: srli a6, a4, 40 -; RV64I-NEXT: sb a6, 21(a2) -; 
RV64I-NEXT: srli a6, a4, 32 -; RV64I-NEXT: sb a6, 20(a2) -; RV64I-NEXT: srli a6, a4, 24 -; RV64I-NEXT: sb a6, 19(a2) -; RV64I-NEXT: srli a6, a4, 16 -; RV64I-NEXT: sb a6, 18(a2) -; RV64I-NEXT: or a6, a4, t1 -; RV64I-NEXT: srli a4, a4, 8 -; RV64I-NEXT: sb a4, 17(a2) -; RV64I-NEXT: srli a4, a3, 56 -; RV64I-NEXT: sb a4, 31(a2) -; RV64I-NEXT: srli a4, a3, 48 -; RV64I-NEXT: sb a4, 30(a2) -; RV64I-NEXT: srli a4, a3, 40 -; RV64I-NEXT: sb a4, 29(a2) -; RV64I-NEXT: srli a4, a3, 32 -; RV64I-NEXT: sb a4, 28(a2) -; RV64I-NEXT: srli a4, a3, 24 -; RV64I-NEXT: sb a4, 27(a2) -; RV64I-NEXT: srli a4, a3, 16 -; RV64I-NEXT: sb a4, 26(a2) -; RV64I-NEXT: or a4, a3, a7 -; RV64I-NEXT: srli a3, a3, 8 -; RV64I-NEXT: sb a3, 25(a2) -; RV64I-NEXT: srli a3, a5, 56 -; RV64I-NEXT: sb a3, 7(a2) -; RV64I-NEXT: srli a3, a5, 48 -; RV64I-NEXT: sb a3, 6(a2) -; RV64I-NEXT: srli a3, a5, 40 -; RV64I-NEXT: sb a3, 5(a2) -; RV64I-NEXT: srli a3, a5, 32 -; RV64I-NEXT: sb a3, 4(a2) -; RV64I-NEXT: srli a3, a5, 24 -; RV64I-NEXT: sb a3, 3(a2) -; RV64I-NEXT: srli a3, a5, 16 -; RV64I-NEXT: sb a3, 2(a2) -; RV64I-NEXT: sb a5, 0(a2) +; RV64I-NEXT: slli a0, a0, 24 +; RV64I-NEXT: or t1, a0, t1 +; RV64I-NEXT: xori t2, a1, 63 +; RV64I-NEXT: srl a0, t0, t2 +; RV64I-NEXT: or a7, t1, a7 +; RV64I-NEXT: slli a7, a7, 32 +; RV64I-NEXT: or a6, a7, a6 +; RV64I-NEXT: srli a7, a6, 1 +; RV64I-NEXT: srl a7, a7, t2 +; RV64I-NEXT: srli t0, a3, 1 +; RV64I-NEXT: not t1, a1 +; RV64I-NEXT: srl t0, t0, t1 +; RV64I-NEXT: sll a3, a3, a1 +; RV64I-NEXT: sll a5, a5, a1 +; RV64I-NEXT: sll a6, a6, a1 +; RV64I-NEXT: sll a1, a4, a1 +; RV64I-NEXT: srli a4, a6, 56 +; RV64I-NEXT: sb a4, 23(a2) +; RV64I-NEXT: srli a4, a6, 48 +; RV64I-NEXT: sb a4, 22(a2) +; RV64I-NEXT: srli a4, a6, 40 +; RV64I-NEXT: sb a4, 21(a2) +; RV64I-NEXT: srli a4, a6, 32 +; RV64I-NEXT: sb a4, 20(a2) +; RV64I-NEXT: srli a4, a6, 24 +; RV64I-NEXT: sb a4, 19(a2) +; RV64I-NEXT: srli a4, a6, 16 +; RV64I-NEXT: sb a4, 18(a2) +; RV64I-NEXT: or a4, a6, t0 +; RV64I-NEXT: srli a6, a6, 8 +; RV64I-NEXT: sb a6, 17(a2) +; RV64I-NEXT: srli a6, a5, 56 +; RV64I-NEXT: sb a6, 31(a2) +; RV64I-NEXT: srli a6, a5, 48 +; RV64I-NEXT: sb a6, 30(a2) +; RV64I-NEXT: srli a6, a5, 40 +; RV64I-NEXT: sb a6, 29(a2) +; RV64I-NEXT: srli a6, a5, 32 +; RV64I-NEXT: sb a6, 28(a2) +; RV64I-NEXT: srli a6, a5, 24 +; RV64I-NEXT: sb a6, 27(a2) +; RV64I-NEXT: srli a6, a5, 16 +; RV64I-NEXT: sb a6, 26(a2) +; RV64I-NEXT: or a6, a5, a7 ; RV64I-NEXT: srli a5, a5, 8 -; RV64I-NEXT: sb a5, 1(a2) -; RV64I-NEXT: srli a3, a1, 56 -; RV64I-NEXT: sb a3, 15(a2) -; RV64I-NEXT: srli a3, a1, 48 -; RV64I-NEXT: sb a3, 14(a2) -; RV64I-NEXT: srli a3, a1, 40 -; RV64I-NEXT: sb a3, 13(a2) -; RV64I-NEXT: srli a3, a1, 32 -; RV64I-NEXT: sb a3, 12(a2) -; RV64I-NEXT: srli a3, a1, 24 -; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: srli a3, a1, 16 -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: or a0, a1, a0 +; RV64I-NEXT: sb a5, 25(a2) +; RV64I-NEXT: srli a5, a1, 56 +; RV64I-NEXT: sb a5, 7(a2) +; RV64I-NEXT: srli a5, a1, 48 +; RV64I-NEXT: sb a5, 6(a2) +; RV64I-NEXT: srli a5, a1, 40 +; RV64I-NEXT: sb a5, 5(a2) +; RV64I-NEXT: srli a5, a1, 32 +; RV64I-NEXT: sb a5, 4(a2) +; RV64I-NEXT: srli a5, a1, 24 +; RV64I-NEXT: sb a5, 3(a2) +; RV64I-NEXT: srli a5, a1, 16 +; RV64I-NEXT: sb a5, 2(a2) +; RV64I-NEXT: sb a1, 0(a2) ; RV64I-NEXT: srli a1, a1, 8 -; RV64I-NEXT: sb a1, 9(a2) -; RV64I-NEXT: sb a6, 16(a2) -; RV64I-NEXT: sb a4, 24(a2) +; RV64I-NEXT: sb a1, 1(a2) +; RV64I-NEXT: srli a1, a3, 56 +; RV64I-NEXT: sb a1, 15(a2) +; RV64I-NEXT: srli a1, a3, 48 +; RV64I-NEXT: sb a1, 14(a2) +; RV64I-NEXT: srli a1, 
a3, 40 +; RV64I-NEXT: sb a1, 13(a2) +; RV64I-NEXT: srli a1, a3, 32 +; RV64I-NEXT: sb a1, 12(a2) +; RV64I-NEXT: srli a1, a3, 24 +; RV64I-NEXT: sb a1, 11(a2) +; RV64I-NEXT: srli a1, a3, 16 +; RV64I-NEXT: sb a1, 10(a2) +; RV64I-NEXT: or a0, a3, a0 +; RV64I-NEXT: srli a3, a3, 8 +; RV64I-NEXT: sb a3, 9(a2) +; RV64I-NEXT: sb a4, 16(a2) +; RV64I-NEXT: sb a6, 24(a2) ; RV64I-NEXT: sb a0, 8(a2) ; RV64I-NEXT: ld ra, 216(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 208(sp) # 8-byte Folded Reload @@ -2480,17 +2480,17 @@ ; RV32I-NEXT: lbu s7, 19(a0) ; RV32I-NEXT: lbu s8, 1(a1) ; RV32I-NEXT: lbu s9, 20(a0) -; RV32I-NEXT: lbu s10, 0(a1) -; RV32I-NEXT: lbu s11, 21(a0) +; RV32I-NEXT: lbu s10, 21(a0) +; RV32I-NEXT: lbu s11, 0(a1) ; RV32I-NEXT: slli s8, s8, 8 ; RV32I-NEXT: lbu ra, 2(a1) -; RV32I-NEXT: or s8, s8, s10 -; RV32I-NEXT: lbu s10, 22(a0) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: or s8, s8, s11 +; RV32I-NEXT: lbu s11, 22(a0) ; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: or s8, ra, s8 -; RV32I-NEXT: lbu ra, 23(a0) ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: lbu ra, 23(a0) ; RV32I-NEXT: or t0, a1, s8 ; RV32I-NEXT: lbu s8, 24(a0) ; RV32I-NEXT: lbu a7, 25(a0) @@ -2509,8 +2509,8 @@ ; RV32I-NEXT: sb a7, 85(sp) ; RV32I-NEXT: sb s8, 84(sp) ; RV32I-NEXT: sb ra, 83(sp) -; RV32I-NEXT: sb s10, 82(sp) -; RV32I-NEXT: sb s11, 81(sp) +; RV32I-NEXT: sb s11, 82(sp) +; RV32I-NEXT: sb s10, 81(sp) ; RV32I-NEXT: sb s9, 80(sp) ; RV32I-NEXT: sb s7, 79(sp) ; RV32I-NEXT: sb s6, 78(sp) @@ -2582,8 +2582,8 @@ ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a3, a3, 16 ; RV32I-NEXT: slli a4, a4, 24 -; RV32I-NEXT: or a0, a3, a0 -; RV32I-NEXT: or t4, a4, a0 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or t3, a3, a0 ; RV32I-NEXT: andi a1, t0, 7 ; RV32I-NEXT: lbu a0, 1(a5) ; RV32I-NEXT: lbu a3, 0(a5) @@ -2593,46 +2593,45 @@ ; RV32I-NEXT: or a0, a0, a3 ; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: or a6, a6, a0 +; RV32I-NEXT: or a3, a6, a4 +; RV32I-NEXT: or a6, a3, a0 ; RV32I-NEXT: srli a0, a6, 1 -; RV32I-NEXT: xori t0, a1, 31 -; RV32I-NEXT: srl a0, a0, t0 +; RV32I-NEXT: xori a7, a1, 31 +; RV32I-NEXT: srl a0, a0, a7 ; RV32I-NEXT: lbu a3, 13(a5) ; RV32I-NEXT: lbu a4, 12(a5) -; RV32I-NEXT: lbu a7, 14(a5) +; RV32I-NEXT: lbu t0, 14(a5) ; RV32I-NEXT: lbu t1, 15(a5) ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t0, t0, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a3, a7, a3 -; RV32I-NEXT: or t1, t1, a3 +; RV32I-NEXT: or a4, t1, t0 +; RV32I-NEXT: or t0, a4, a3 ; RV32I-NEXT: lbu a3, 9(a5) ; RV32I-NEXT: lbu a4, 8(a5) -; RV32I-NEXT: lbu a7, 10(a5) +; RV32I-NEXT: lbu t1, 10(a5) ; RV32I-NEXT: lbu t2, 11(a5) ; RV32I-NEXT: slli a3, a3, 8 ; RV32I-NEXT: or a3, a3, a4 -; RV32I-NEXT: slli a7, a7, 16 +; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a3, a7, a3 -; RV32I-NEXT: or t2, t2, a3 -; RV32I-NEXT: srli a3, t2, 1 -; RV32I-NEXT: srl a3, a3, t0 -; RV32I-NEXT: srli a4, t4, 1 -; RV32I-NEXT: not t3, a1 -; RV32I-NEXT: srl a7, a4, t3 -; RV32I-NEXT: lbu a4, 21(a5) +; RV32I-NEXT: or a4, t2, t1 +; RV32I-NEXT: or t1, a4, a3 +; RV32I-NEXT: srli a3, t1, 1 +; RV32I-NEXT: srl a3, a3, a7 +; RV32I-NEXT: srli a4, t3, 1 +; RV32I-NEXT: not t2, a1 +; RV32I-NEXT: lbu t4, 21(a5) ; RV32I-NEXT: lbu t5, 20(a5) ; RV32I-NEXT: lbu t6, 22(a5) ; RV32I-NEXT: lbu s0, 23(a5) -; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a4, a4, t5 +; RV32I-NEXT: slli t4, t4, 8 +; RV32I-NEXT: or t4, t4, t5 ; 
RV32I-NEXT: slli t6, t6, 16 ; RV32I-NEXT: slli s0, s0, 24 -; RV32I-NEXT: or a4, t6, a4 -; RV32I-NEXT: or a4, s0, a4 +; RV32I-NEXT: or t5, s0, t6 +; RV32I-NEXT: or t4, t5, t4 ; RV32I-NEXT: lbu t5, 17(a5) ; RV32I-NEXT: lbu t6, 16(a5) ; RV32I-NEXT: lbu s0, 18(a5) @@ -2641,8 +2640,8 @@ ; RV32I-NEXT: or t5, t5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 ; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: or t5, s1, t5 ; RV32I-NEXT: lbu t6, 29(a5) ; RV32I-NEXT: lbu s0, 28(a5) ; RV32I-NEXT: lbu s1, 30(a5) @@ -2651,30 +2650,31 @@ ; RV32I-NEXT: or t6, t6, s0 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: lbu s0, 25(a5) -; RV32I-NEXT: or t6, s1, t6 -; RV32I-NEXT: lbu s1, 24(a5) -; RV32I-NEXT: or t6, s2, t6 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: lbu s2, 26(a5) -; RV32I-NEXT: or s0, s0, s1 -; RV32I-NEXT: srli s1, t5, 1 -; RV32I-NEXT: srl s1, s1, t0 -; RV32I-NEXT: slli s2, s2, 16 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: lbu s1, 25(a5) +; RV32I-NEXT: lbu s2, 24(a5) +; RV32I-NEXT: srl a4, a4, t2 +; RV32I-NEXT: or t6, s0, t6 +; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or s0, s1, s2 +; RV32I-NEXT: lbu s1, 26(a5) ; RV32I-NEXT: lbu a5, 27(a5) -; RV32I-NEXT: or s0, s2, s0 -; RV32I-NEXT: srli s2, t1, 1 -; RV32I-NEXT: srl s2, s2, t3 +; RV32I-NEXT: srli s2, t5, 1 +; RV32I-NEXT: srl s2, s2, a7 +; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli a5, a5, 24 +; RV32I-NEXT: or a5, a5, s1 +; RV32I-NEXT: srli s1, t0, 1 +; RV32I-NEXT: srl s1, s1, t2 ; RV32I-NEXT: or a5, a5, s0 ; RV32I-NEXT: srli s0, a5, 1 -; RV32I-NEXT: srl t0, s0, t0 -; RV32I-NEXT: srli s0, a4, 1 -; RV32I-NEXT: srl t3, s0, t3 -; RV32I-NEXT: sll t4, t4, a1 +; RV32I-NEXT: srl a7, s0, a7 +; RV32I-NEXT: srli s0, t4, 1 +; RV32I-NEXT: srl t2, s0, t2 +; RV32I-NEXT: sll t3, t3, a1 +; RV32I-NEXT: sll t0, t0, a1 ; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t2, t2, a1 -; RV32I-NEXT: sll a4, a4, a1 +; RV32I-NEXT: sll t4, t4, a1 ; RV32I-NEXT: sll t5, t5, a1 ; RV32I-NEXT: sll t6, t6, a1 ; RV32I-NEXT: sll a5, a5, a1 @@ -2683,62 +2683,62 @@ ; RV32I-NEXT: sb a6, 27(a2) ; RV32I-NEXT: srli a6, a5, 16 ; RV32I-NEXT: sb a6, 26(a2) -; RV32I-NEXT: or a6, a5, t3 +; RV32I-NEXT: or a6, a5, t2 ; RV32I-NEXT: srli a5, a5, 8 ; RV32I-NEXT: sb a5, 25(a2) ; RV32I-NEXT: srli a5, t6, 24 ; RV32I-NEXT: sb a5, 31(a2) ; RV32I-NEXT: srli a5, t6, 16 ; RV32I-NEXT: sb a5, 30(a2) -; RV32I-NEXT: or a5, t6, t0 -; RV32I-NEXT: srli t0, t6, 8 -; RV32I-NEXT: sb t0, 29(a2) -; RV32I-NEXT: srli t0, t5, 24 -; RV32I-NEXT: sb t0, 19(a2) -; RV32I-NEXT: srli t0, t5, 16 -; RV32I-NEXT: sb t0, 18(a2) -; RV32I-NEXT: or t0, t5, s2 -; RV32I-NEXT: srli t3, t5, 8 -; RV32I-NEXT: sb t3, 17(a2) -; RV32I-NEXT: srli t3, a4, 24 -; RV32I-NEXT: sb t3, 23(a2) -; RV32I-NEXT: srli t3, a4, 16 -; RV32I-NEXT: sb t3, 22(a2) -; RV32I-NEXT: or s1, a4, s1 -; RV32I-NEXT: srli a4, a4, 8 -; RV32I-NEXT: sb a4, 21(a2) -; RV32I-NEXT: srli a4, t2, 24 -; RV32I-NEXT: sb a4, 11(a2) -; RV32I-NEXT: srli a4, t2, 16 -; RV32I-NEXT: sb a4, 10(a2) -; RV32I-NEXT: or a4, t2, a7 -; RV32I-NEXT: srli a7, t2, 8 -; RV32I-NEXT: sb a7, 9(a2) -; RV32I-NEXT: srli a7, t1, 24 -; RV32I-NEXT: sb a7, 15(a2) -; RV32I-NEXT: srli a7, t1, 16 -; RV32I-NEXT: sb a7, 14(a2) -; RV32I-NEXT: or a3, t1, a3 -; RV32I-NEXT: srli a7, t1, 8 -; RV32I-NEXT: sb a7, 13(a2) -; RV32I-NEXT: srli a7, a1, 24 -; RV32I-NEXT: sb a7, 3(a2) -; RV32I-NEXT: srli a7, a1, 16 -; RV32I-NEXT: sb a7, 2(a2) +; RV32I-NEXT: or a5, t6, a7 +; RV32I-NEXT: srli a7, t6, 8 +; RV32I-NEXT: sb a7, 29(a2) +; RV32I-NEXT: srli a7, t5, 24 +; 
RV32I-NEXT: sb a7, 19(a2) +; RV32I-NEXT: srli a7, t5, 16 +; RV32I-NEXT: sb a7, 18(a2) +; RV32I-NEXT: or a7, t5, s1 +; RV32I-NEXT: srli t2, t5, 8 +; RV32I-NEXT: sb t2, 17(a2) +; RV32I-NEXT: srli t2, t4, 24 +; RV32I-NEXT: sb t2, 23(a2) +; RV32I-NEXT: srli t2, t4, 16 +; RV32I-NEXT: sb t2, 22(a2) +; RV32I-NEXT: or t2, t4, s2 +; RV32I-NEXT: srli t4, t4, 8 +; RV32I-NEXT: sb t4, 21(a2) +; RV32I-NEXT: srli t4, t1, 24 +; RV32I-NEXT: sb t4, 11(a2) +; RV32I-NEXT: srli t4, t1, 16 +; RV32I-NEXT: sb t4, 10(a2) +; RV32I-NEXT: or a4, t1, a4 +; RV32I-NEXT: srli t1, t1, 8 +; RV32I-NEXT: sb t1, 9(a2) +; RV32I-NEXT: srli t1, t0, 24 +; RV32I-NEXT: sb t1, 15(a2) +; RV32I-NEXT: srli t1, t0, 16 +; RV32I-NEXT: sb t1, 14(a2) +; RV32I-NEXT: or a3, t0, a3 +; RV32I-NEXT: srli t0, t0, 8 +; RV32I-NEXT: sb t0, 13(a2) +; RV32I-NEXT: srli t0, a1, 24 +; RV32I-NEXT: sb t0, 3(a2) +; RV32I-NEXT: srli t0, a1, 16 +; RV32I-NEXT: sb t0, 2(a2) ; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: srli a1, a1, 8 ; RV32I-NEXT: sb a1, 1(a2) -; RV32I-NEXT: srli a1, t4, 24 +; RV32I-NEXT: srli a1, t3, 24 ; RV32I-NEXT: sb a1, 7(a2) -; RV32I-NEXT: srli a1, t4, 16 +; RV32I-NEXT: srli a1, t3, 16 ; RV32I-NEXT: sb a1, 6(a2) -; RV32I-NEXT: or a0, t4, a0 -; RV32I-NEXT: srli a1, t4, 8 +; RV32I-NEXT: or a0, t3, a0 +; RV32I-NEXT: srli a1, t3, 8 ; RV32I-NEXT: sb a1, 5(a2) ; RV32I-NEXT: sb a6, 24(a2) ; RV32I-NEXT: sb a5, 28(a2) -; RV32I-NEXT: sb t0, 16(a2) -; RV32I-NEXT: sb s1, 20(a2) +; RV32I-NEXT: sb a7, 16(a2) +; RV32I-NEXT: sb t2, 20(a2) ; RV32I-NEXT: sb a4, 8(a2) ; RV32I-NEXT: sb a3, 12(a2) ; RV32I-NEXT: sb a0, 4(a2) @@ -2806,33 +2806,33 @@ ; RV64I-NEXT: lbu s5, 16(a0) ; RV64I-NEXT: lbu s6, 17(a0) ; RV64I-NEXT: lbu s7, 18(a0) -; RV64I-NEXT: lbu s8, 1(a1) -; RV64I-NEXT: lbu s9, 0(a1) -; RV64I-NEXT: lbu s10, 2(a1) -; RV64I-NEXT: lbu s11, 19(a0) -; RV64I-NEXT: slli s8, s8, 8 -; RV64I-NEXT: or s8, s8, s9 -; RV64I-NEXT: slli s10, s10, 16 -; RV64I-NEXT: lbu s9, 5(a1) -; RV64I-NEXT: lbu ra, 4(a1) -; RV64I-NEXT: or s8, s10, s8 -; RV64I-NEXT: lbu s10, 6(a1) +; RV64I-NEXT: lbu s8, 19(a0) +; RV64I-NEXT: lbu s9, 1(a1) +; RV64I-NEXT: lbu s10, 0(a1) +; RV64I-NEXT: lbu s11, 2(a1) +; RV64I-NEXT: lbu ra, 3(a1) ; RV64I-NEXT: slli s9, s9, 8 -; RV64I-NEXT: or s9, s9, ra -; RV64I-NEXT: lbu ra, 7(a1) -; RV64I-NEXT: slli s10, s10, 16 -; RV64I-NEXT: or s9, s10, s9 -; RV64I-NEXT: lbu s10, 20(a0) +; RV64I-NEXT: or s9, s9, s10 +; RV64I-NEXT: slli s11, s11, 16 ; RV64I-NEXT: slli ra, ra, 24 -; RV64I-NEXT: or s9, ra, s9 -; RV64I-NEXT: lbu ra, 21(a0) -; RV64I-NEXT: lbu a1, 3(a1) -; RV64I-NEXT: slli s9, s9, 32 -; RV64I-NEXT: or s8, s9, s8 -; RV64I-NEXT: lbu s9, 22(a0) +; RV64I-NEXT: lbu s10, 5(a1) +; RV64I-NEXT: or s11, ra, s11 +; RV64I-NEXT: or s9, s11, s9 +; RV64I-NEXT: lbu s11, 4(a1) +; RV64I-NEXT: slli s10, s10, 8 +; RV64I-NEXT: lbu ra, 6(a1) +; RV64I-NEXT: lbu a1, 7(a1) +; RV64I-NEXT: or s10, s10, s11 +; RV64I-NEXT: lbu s11, 20(a0) +; RV64I-NEXT: slli ra, ra, 16 ; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or t1, s8, a1 -; RV64I-NEXT: lbu s8, 23(a0) +; RV64I-NEXT: or a1, a1, ra +; RV64I-NEXT: lbu ra, 21(a0) +; RV64I-NEXT: or a1, a1, s10 +; RV64I-NEXT: lbu s10, 22(a0) +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or t1, a1, s9 +; RV64I-NEXT: lbu s9, 23(a0) ; RV64I-NEXT: lbu a7, 24(a0) ; RV64I-NEXT: lbu a6, 25(a0) ; RV64I-NEXT: lbu a5, 26(a0) @@ -2847,11 +2847,11 @@ ; RV64I-NEXT: sb a5, 82(sp) ; RV64I-NEXT: sb a6, 81(sp) ; RV64I-NEXT: sb a7, 80(sp) -; RV64I-NEXT: sb s8, 79(sp) -; RV64I-NEXT: sb s9, 78(sp) +; RV64I-NEXT: sb s9, 79(sp) +; RV64I-NEXT: sb s10, 78(sp) ; RV64I-NEXT: sb ra, 
77(sp) -; RV64I-NEXT: sb s10, 76(sp) -; RV64I-NEXT: sb s11, 75(sp) +; RV64I-NEXT: sb s11, 76(sp) +; RV64I-NEXT: sb s8, 75(sp) ; RV64I-NEXT: sb s7, 74(sp) ; RV64I-NEXT: sb s6, 73(sp) ; RV64I-NEXT: sb s5, 72(sp) @@ -2862,9 +2862,9 @@ ; RV64I-NEXT: sb s0, 67(sp) ; RV64I-NEXT: sb t6, 66(sp) ; RV64I-NEXT: sb t5, 65(sp) +; RV64I-NEXT: sb t4, 64(sp) ; RV64I-NEXT: sb t0, 87(sp) ; RV64I-NEXT: slli t0, t0, 56 -; RV64I-NEXT: sb t4, 64(sp) ; RV64I-NEXT: sb t3, 63(sp) ; RV64I-NEXT: sb t2, 62(sp) ; RV64I-NEXT: ld a0, 8(sp) # 8-byte Folded Reload @@ -2931,20 +2931,20 @@ ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: slli a4, a4, 16 ; RV64I-NEXT: slli a5, a5, 24 +; RV64I-NEXT: or a4, a5, a4 ; RV64I-NEXT: or a0, a4, a0 ; RV64I-NEXT: lbu a1, 13(a3) ; RV64I-NEXT: lbu a4, 12(a3) -; RV64I-NEXT: lbu a6, 14(a3) -; RV64I-NEXT: lbu a7, 15(a3) +; RV64I-NEXT: lbu a5, 14(a3) +; RV64I-NEXT: lbu a6, 15(a3) ; RV64I-NEXT: slli a1, a1, 8 ; RV64I-NEXT: or a1, a1, a4 -; RV64I-NEXT: slli a6, a6, 16 -; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a1, a6, a1 -; RV64I-NEXT: or a1, a7, a1 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a1, a4, a1 ; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a0, a1, a0 -; RV64I-NEXT: or a4, a0, a5 +; RV64I-NEXT: or a4, a1, a0 ; RV64I-NEXT: andi a1, t1, 7 ; RV64I-NEXT: lbu a0, 17(a3) ; RV64I-NEXT: lbu a5, 16(a3) @@ -2954,20 +2954,20 @@ ; RV64I-NEXT: or a0, a0, a5 ; RV64I-NEXT: slli a6, a6, 16 ; RV64I-NEXT: slli a7, a7, 24 -; RV64I-NEXT: or a0, a6, a0 +; RV64I-NEXT: or a5, a7, a6 +; RV64I-NEXT: or a0, a5, a0 ; RV64I-NEXT: lbu a5, 21(a3) ; RV64I-NEXT: lbu a6, 20(a3) -; RV64I-NEXT: lbu t0, 22(a3) -; RV64I-NEXT: lbu t1, 23(a3) +; RV64I-NEXT: lbu a7, 22(a3) +; RV64I-NEXT: lbu t0, 23(a3) ; RV64I-NEXT: slli a5, a5, 8 ; RV64I-NEXT: or a5, a5, a6 -; RV64I-NEXT: slli t0, t0, 16 -; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a5, t0, a5 -; RV64I-NEXT: or a5, t1, a5 +; RV64I-NEXT: slli a7, a7, 16 +; RV64I-NEXT: slli t0, t0, 24 +; RV64I-NEXT: or a6, t0, a7 +; RV64I-NEXT: or a5, a6, a5 ; RV64I-NEXT: slli a5, a5, 32 -; RV64I-NEXT: or a0, a5, a0 -; RV64I-NEXT: or a5, a0, a7 +; RV64I-NEXT: or a5, a5, a0 ; RV64I-NEXT: slli a0, a5, 1 ; RV64I-NEXT: not a6, a1 ; RV64I-NEXT: sll a0, a0, a6 @@ -2979,45 +2979,45 @@ ; RV64I-NEXT: or a6, a6, a7 ; RV64I-NEXT: slli t0, t0, 16 ; RV64I-NEXT: slli t1, t1, 24 -; RV64I-NEXT: or a6, t0, a6 +; RV64I-NEXT: or a7, t1, t0 +; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a7, 5(a3) ; RV64I-NEXT: lbu t0, 4(a3) -; RV64I-NEXT: lbu t2, 6(a3) -; RV64I-NEXT: lbu t3, 7(a3) +; RV64I-NEXT: lbu t1, 6(a3) +; RV64I-NEXT: lbu t2, 7(a3) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or a7, t2, a7 -; RV64I-NEXT: or a7, t3, a7 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: slli a7, a7, 32 ; RV64I-NEXT: or a6, a7, a6 ; RV64I-NEXT: lbu a7, 25(a3) ; RV64I-NEXT: lbu t0, 24(a3) -; RV64I-NEXT: lbu t2, 26(a3) -; RV64I-NEXT: or a6, a6, t1 +; RV64I-NEXT: lbu t1, 26(a3) +; RV64I-NEXT: lbu t2, 27(a3) ; RV64I-NEXT: slli a7, a7, 8 ; RV64I-NEXT: or a7, a7, t0 -; RV64I-NEXT: slli t2, t2, 16 +; RV64I-NEXT: slli t1, t1, 16 +; RV64I-NEXT: slli t2, t2, 24 +; RV64I-NEXT: or t0, t2, t1 +; RV64I-NEXT: or a7, t0, a7 ; RV64I-NEXT: lbu t0, 29(a3) -; RV64I-NEXT: or a7, t2, a7 ; RV64I-NEXT: lbu t1, 28(a3) ; RV64I-NEXT: lbu t2, 30(a3) +; RV64I-NEXT: lbu a3, 31(a3) ; RV64I-NEXT: slli t0, t0, 8 -; 
RV64I-NEXT: lbu t3, 31(a3) ; RV64I-NEXT: or t0, t0, t1 ; RV64I-NEXT: slli t2, t2, 16 -; RV64I-NEXT: or t0, t2, t0 -; RV64I-NEXT: slli t3, t3, 24 -; RV64I-NEXT: or t0, t3, t0 +; RV64I-NEXT: slli a3, a3, 24 +; RV64I-NEXT: or a3, a3, t2 ; RV64I-NEXT: slli t1, a4, 1 -; RV64I-NEXT: lbu a3, 27(a3) -; RV64I-NEXT: slli t0, t0, 32 -; RV64I-NEXT: or a7, t0, a7 +; RV64I-NEXT: or a3, a3, t0 ; RV64I-NEXT: xori t0, a1, 63 ; RV64I-NEXT: sll t1, t1, t0 -; RV64I-NEXT: slli a3, a3, 24 -; RV64I-NEXT: or a3, a7, a3 +; RV64I-NEXT: slli a3, a3, 32 +; RV64I-NEXT: or a3, a3, a7 ; RV64I-NEXT: slli a7, a3, 1 ; RV64I-NEXT: sll a7, a7, t0 ; RV64I-NEXT: srl a4, a4, a1 @@ -3147,17 +3147,17 @@ ; RV32I-NEXT: lbu s8, 18(a0) ; RV32I-NEXT: lbu a4, 1(a1) ; RV32I-NEXT: lbu s9, 19(a0) -; RV32I-NEXT: lbu s10, 0(a1) -; RV32I-NEXT: lbu s11, 20(a0) +; RV32I-NEXT: lbu s10, 20(a0) +; RV32I-NEXT: lbu s11, 0(a1) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: lbu ra, 2(a1) -; RV32I-NEXT: or a4, a4, s10 -; RV32I-NEXT: lbu s10, 21(a0) ; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: or a4, a4, s11 +; RV32I-NEXT: lbu s11, 21(a0) ; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: or a4, ra, a4 -; RV32I-NEXT: lbu ra, 22(a0) ; RV32I-NEXT: slli a1, a1, 24 +; RV32I-NEXT: or a1, a1, ra +; RV32I-NEXT: lbu ra, 22(a0) ; RV32I-NEXT: or t1, a1, a4 ; RV32I-NEXT: lbu t0, 23(a0) ; RV32I-NEXT: lbu a7, 24(a0) @@ -3176,8 +3176,8 @@ ; RV32I-NEXT: sb a7, 52(sp) ; RV32I-NEXT: sb t0, 51(sp) ; RV32I-NEXT: sb ra, 50(sp) -; RV32I-NEXT: sb s10, 49(sp) -; RV32I-NEXT: sb s11, 48(sp) +; RV32I-NEXT: sb s11, 49(sp) +; RV32I-NEXT: sb s10, 48(sp) ; RV32I-NEXT: sb s9, 47(sp) ; RV32I-NEXT: sb s8, 46(sp) ; RV32I-NEXT: sb s7, 45(sp) @@ -3254,8 +3254,8 @@ ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a4, a4, 16 ; RV32I-NEXT: slli a5, a5, 24 -; RV32I-NEXT: or a0, a4, a0 -; RV32I-NEXT: or t4, a5, a0 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: or t4, a4, a0 ; RV32I-NEXT: andi a4, t1, 7 ; RV32I-NEXT: lbu a0, 9(a3) ; RV32I-NEXT: lbu a1, 8(a3) @@ -3265,8 +3265,8 @@ ; RV32I-NEXT: or a0, a0, a1 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a0, a5, a0 -; RV32I-NEXT: or a6, a6, a0 +; RV32I-NEXT: or a1, a6, a5 +; RV32I-NEXT: or a6, a1, a0 ; RV32I-NEXT: slli a0, a6, 1 ; RV32I-NEXT: not t0, a4 ; RV32I-NEXT: sll a0, a0, t0 @@ -3278,8 +3278,8 @@ ; RV32I-NEXT: or a1, a1, a5 ; RV32I-NEXT: slli a7, a7, 16 ; RV32I-NEXT: slli t1, t1, 24 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: or t1, t1, a1 +; RV32I-NEXT: or a5, t1, a7 +; RV32I-NEXT: or t1, a5, a1 ; RV32I-NEXT: slli a1, t4, 1 ; RV32I-NEXT: xori t2, a4, 31 ; RV32I-NEXT: sll a1, a1, t2 @@ -3291,8 +3291,8 @@ ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: or a5, t3, a5 -; RV32I-NEXT: or t3, t5, a5 +; RV32I-NEXT: or a7, t5, t3 +; RV32I-NEXT: or t3, a7, a5 ; RV32I-NEXT: lbu a5, 17(a3) ; RV32I-NEXT: lbu a7, 16(a3) ; RV32I-NEXT: lbu t5, 18(a3) @@ -3301,8 +3301,8 @@ ; RV32I-NEXT: or a5, a5, a7 ; RV32I-NEXT: slli t5, t5, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t5, a5 -; RV32I-NEXT: or a5, t6, a5 +; RV32I-NEXT: or a7, t6, t5 +; RV32I-NEXT: or a5, a7, a5 ; RV32I-NEXT: slli a7, a5, 1 ; RV32I-NEXT: sll a7, a7, t0 ; RV32I-NEXT: lbu t5, 21(a3) @@ -3313,8 +3313,8 @@ ; RV32I-NEXT: or t5, t5, t6 ; RV32I-NEXT: slli s0, s0, 16 ; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: or s0, s1, s0 ; RV32I-NEXT: or t5, s0, t5 -; RV32I-NEXT: or t5, s1, t5 ; RV32I-NEXT: lbu t6, 25(a3) ; RV32I-NEXT: lbu s0, 24(a3) ; RV32I-NEXT: lbu s1, 26(a3) @@ -3323,23 +3323,23 @@ ; RV32I-NEXT: or t6, t6, 
s0 ; RV32I-NEXT: slli s1, s1, 16 ; RV32I-NEXT: slli s2, s2, 24 -; RV32I-NEXT: or t6, s1, t6 -; RV32I-NEXT: or t6, s2, t6 +; RV32I-NEXT: or s0, s2, s1 +; RV32I-NEXT: or t6, s0, t6 ; RV32I-NEXT: lbu s0, 29(a3) -; RV32I-NEXT: slli s1, t6, 1 -; RV32I-NEXT: lbu s2, 28(a3) -; RV32I-NEXT: sll t0, s1, t0 +; RV32I-NEXT: lbu s1, 28(a3) +; RV32I-NEXT: slli s2, t6, 1 +; RV32I-NEXT: sll t0, s2, t0 ; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: or s0, s0, s1 ; RV32I-NEXT: lbu s1, 30(a3) -; RV32I-NEXT: or s0, s0, s2 +; RV32I-NEXT: lbu a3, 31(a3) ; RV32I-NEXT: slli s2, t3, 1 ; RV32I-NEXT: sll s2, s2, t2 ; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: lbu a3, 31(a3) -; RV32I-NEXT: or s0, s1, s0 +; RV32I-NEXT: slli a3, a3, 24 +; RV32I-NEXT: or a3, a3, s1 ; RV32I-NEXT: slli s1, t5, 1 ; RV32I-NEXT: sll s1, s1, t2 -; RV32I-NEXT: slli a3, a3, 24 ; RV32I-NEXT: or a3, a3, s0 ; RV32I-NEXT: slli s0, a3, 1 ; RV32I-NEXT: sll t2, s0, t2 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -187,10 +187,10 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32-LABEL: saddo1.i64: ; RV32: # %bb.0: # %entry +; RV32-NEXT: add a5, a1, a3 ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: sltu a0, a2, a0 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a5, a1, a0 +; RV32-NEXT: add a5, a5, a0 ; RV32-NEXT: xor a0, a1, a5 ; RV32-NEXT: xor a1, a1, a3 ; RV32-NEXT: not a1, a1 @@ -211,10 +211,10 @@ ; ; RV32ZBA-LABEL: saddo1.i64: ; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a5, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 ; RV32ZBA-NEXT: sltu a0, a2, a0 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: add a5, a1, a0 +; RV32ZBA-NEXT: add a5, a5, a0 ; RV32ZBA-NEXT: xor a0, a1, a5 ; RV32ZBA-NEXT: xor a1, a1, a3 ; RV32ZBA-NEXT: not a1, a1 @@ -449,10 +449,10 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) { ; RV32-LABEL: uaddo.i64: ; RV32: # %bb.0: # %entry +; RV32-NEXT: add a3, a1, a3 ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: add a3, a3, a0 -; RV32-NEXT: add a3, a1, a3 ; RV32-NEXT: beq a3, a1, .LBB10_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a3, a1 @@ -470,10 +470,10 @@ ; ; RV32ZBA-LABEL: uaddo.i64: ; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a3, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 ; RV32ZBA-NEXT: sltu a0, a2, a0 ; RV32ZBA-NEXT: add a3, a3, a0 -; RV32ZBA-NEXT: add a3, a1, a3 ; RV32ZBA-NEXT: beq a3, a1, .LBB10_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a3, a1 @@ -634,8 +634,8 @@ ; RV32-LABEL: ssubo.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a5, a0, a2 -; RV32-NEXT: add a5, a3, a5 -; RV32-NEXT: sub a5, a1, a5 +; RV32-NEXT: sub a6, a1, a3 +; RV32-NEXT: sub a5, a6, a5 ; RV32-NEXT: xor a6, a1, a5 ; RV32-NEXT: xor a1, a1, a3 ; RV32-NEXT: and a1, a1, a6 @@ -658,8 +658,8 @@ ; RV32ZBA-LABEL: ssubo.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sltu a5, a0, a2 -; RV32ZBA-NEXT: add a5, a3, a5 -; RV32ZBA-NEXT: sub a5, a1, a5 +; RV32ZBA-NEXT: sub a6, a1, a3 +; RV32ZBA-NEXT: sub a5, a6, a5 ; RV32ZBA-NEXT: xor a6, a1, a5 ; RV32ZBA-NEXT: xor a1, a1, a3 ; RV32ZBA-NEXT: and a1, a1, a6 @@ -806,8 +806,8 @@ ; RV32-LABEL: usubo.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a5, a0, a2 -; RV32-NEXT: add a3, a3, a5 ; RV32-NEXT: sub a3, a1, a3 +; RV32-NEXT: sub a3, a3, a5 ; RV32-NEXT: sub a2, a0, a2 ; RV32-NEXT: beq a3, a1, .LBB18_2 ; RV32-NEXT: # %bb.1: # %entry @@ -830,8 +830,8 @@ ; RV32ZBA-LABEL: usubo.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sltu a5, a0, a2 
-; RV32ZBA-NEXT: add a3, a3, a5 ; RV32ZBA-NEXT: sub a3, a1, a3 +; RV32ZBA-NEXT: sub a3, a3, a5 ; RV32ZBA-NEXT: sub a2, a0, a2 ; RV32ZBA-NEXT: beq a3, a1, .LBB18_2 ; RV32ZBA-NEXT: # %bb.1: # %entry @@ -987,21 +987,21 @@ ; RV32-NEXT: sltu t0, t1, t0 ; RV32-NEXT: sltu a6, a7, a6 ; RV32-NEXT: mulhu a7, a1, a3 +; RV32-NEXT: add a6, a7, a6 ; RV32-NEXT: add a6, a6, t0 -; RV32-NEXT: mulhu t0, a2, t2 +; RV32-NEXT: mulhu a7, a2, t2 +; RV32-NEXT: add a7, a7, t3 ; RV32-NEXT: mul a3, a3, t2 -; RV32-NEXT: add a3, t3, a3 -; RV32-NEXT: add a3, t0, a3 +; RV32-NEXT: add a3, a7, a3 ; RV32-NEXT: mul a1, t4, a1 -; RV32-NEXT: mulhu t0, t4, a0 +; RV32-NEXT: mulhu a7, t4, a0 +; RV32-NEXT: add a1, a7, a1 ; RV32-NEXT: add a1, a1, t5 ; RV32-NEXT: add a1, a1, a3 ; RV32-NEXT: sltu a3, t6, t5 ; RV32-NEXT: add a1, a1, a3 -; RV32-NEXT: add a1, t0, a1 ; RV32-NEXT: add a1, a6, a1 ; RV32-NEXT: add a1, a1, s1 -; RV32-NEXT: add a1, a7, a1 ; RV32-NEXT: srai a3, a5, 31 ; RV32-NEXT: xor a1, a1, a3 ; RV32-NEXT: xor a3, s0, a3 @@ -1058,21 +1058,21 @@ ; RV32ZBA-NEXT: sltu t0, t1, t0 ; RV32ZBA-NEXT: sltu a6, a7, a6 ; RV32ZBA-NEXT: mulhu a7, a1, a3 +; RV32ZBA-NEXT: add a6, a7, a6 ; RV32ZBA-NEXT: add a6, a6, t0 -; RV32ZBA-NEXT: mulhu t0, a2, t2 +; RV32ZBA-NEXT: mulhu a7, a2, t2 +; RV32ZBA-NEXT: add a7, a7, t3 ; RV32ZBA-NEXT: mul a3, a3, t2 -; RV32ZBA-NEXT: add a3, t3, a3 -; RV32ZBA-NEXT: add a3, t0, a3 +; RV32ZBA-NEXT: add a3, a7, a3 ; RV32ZBA-NEXT: mul a1, t4, a1 -; RV32ZBA-NEXT: mulhu t0, t4, a0 +; RV32ZBA-NEXT: mulhu a7, t4, a0 +; RV32ZBA-NEXT: add a1, a7, a1 ; RV32ZBA-NEXT: add a1, a1, t5 ; RV32ZBA-NEXT: add a1, a1, a3 ; RV32ZBA-NEXT: sltu a3, t6, t5 ; RV32ZBA-NEXT: add a1, a1, a3 -; RV32ZBA-NEXT: add a1, t0, a1 ; RV32ZBA-NEXT: add a1, a6, a1 ; RV32ZBA-NEXT: add a1, a1, s1 -; RV32ZBA-NEXT: add a1, a7, a1 ; RV32ZBA-NEXT: srai a3, a5, 31 ; RV32ZBA-NEXT: xor a1, a1, a3 ; RV32ZBA-NEXT: xor a3, s0, a3 @@ -1335,20 +1335,20 @@ ; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a5, a3, a0 ; RV32-NEXT: mul a6, a1, a2 -; RV32-NEXT: mulhu a7, a0, a2 -; RV32-NEXT: add a5, a7, a5 -; RV32-NEXT: add a5, a5, a6 -; RV32-NEXT: sltu a6, a5, a7 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: mulhu a6, a0, a2 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: sltu a6, a5, a6 ; RV32-NEXT: snez a7, a3 ; RV32-NEXT: snez t0, a1 ; RV32-NEXT: and a7, t0, a7 ; RV32-NEXT: mulhu a1, a1, a2 ; RV32-NEXT: snez a1, a1 +; RV32-NEXT: or a1, a7, a1 ; RV32-NEXT: mulhu a3, a3, a0 ; RV32-NEXT: snez a3, a3 ; RV32-NEXT: or a1, a1, a3 ; RV32-NEXT: or a1, a1, a6 -; RV32-NEXT: or a1, a7, a1 ; RV32-NEXT: mul a0, a0, a2 ; RV32-NEXT: sw a0, 0(a4) ; RV32-NEXT: sw a5, 4(a4) @@ -1368,20 +1368,20 @@ ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a5, a3, a0 ; RV32ZBA-NEXT: mul a6, a1, a2 -; RV32ZBA-NEXT: mulhu a7, a0, a2 -; RV32ZBA-NEXT: add a5, a7, a5 -; RV32ZBA-NEXT: add a5, a5, a6 -; RV32ZBA-NEXT: sltu a6, a5, a7 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: mulhu a6, a0, a2 +; RV32ZBA-NEXT: add a5, a6, a5 +; RV32ZBA-NEXT: sltu a6, a5, a6 ; RV32ZBA-NEXT: snez a7, a3 ; RV32ZBA-NEXT: snez t0, a1 ; RV32ZBA-NEXT: and a7, t0, a7 ; RV32ZBA-NEXT: mulhu a1, a1, a2 ; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: or a1, a7, a1 ; RV32ZBA-NEXT: mulhu a3, a3, a0 ; RV32ZBA-NEXT: snez a3, a3 ; RV32ZBA-NEXT: or a1, a1, a3 ; RV32ZBA-NEXT: or a1, a1, a6 -; RV32ZBA-NEXT: or a1, a7, a1 ; RV32ZBA-NEXT: mul a0, a0, a2 ; RV32ZBA-NEXT: sw a0, 0(a4) ; RV32ZBA-NEXT: sw a5, 4(a4) @@ -1561,10 +1561,10 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: saddo.select.i64: ; RV32: # %bb.0: # %entry -; RV32-NEXT: 
add a4, a0, a2 -; RV32-NEXT: sltu a4, a4, a0 -; RV32-NEXT: add a4, a3, a4 -; RV32-NEXT: add a4, a1, a4 +; RV32-NEXT: add a4, a1, a3 +; RV32-NEXT: add a5, a0, a2 +; RV32-NEXT: sltu a5, a5, a0 +; RV32-NEXT: add a4, a4, a5 ; RV32-NEXT: xor a4, a1, a4 ; RV32-NEXT: xor a5, a1, a3 ; RV32-NEXT: not a5, a5 @@ -1589,10 +1589,10 @@ ; ; RV32ZBA-LABEL: saddo.select.i64: ; RV32ZBA: # %bb.0: # %entry -; RV32ZBA-NEXT: add a4, a0, a2 -; RV32ZBA-NEXT: sltu a4, a4, a0 -; RV32ZBA-NEXT: add a4, a3, a4 -; RV32ZBA-NEXT: add a4, a1, a4 +; RV32ZBA-NEXT: add a4, a1, a3 +; RV32ZBA-NEXT: add a5, a0, a2 +; RV32ZBA-NEXT: sltu a5, a5, a0 +; RV32ZBA-NEXT: add a4, a4, a5 ; RV32ZBA-NEXT: xor a4, a1, a4 ; RV32ZBA-NEXT: xor a5, a1, a3 ; RV32ZBA-NEXT: not a5, a5 @@ -1624,10 +1624,10 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: saddo.not.i64: ; RV32: # %bb.0: # %entry +; RV32-NEXT: add a4, a1, a3 ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: sltu a0, a2, a0 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: add a0, a4, a0 ; RV32-NEXT: xor a0, a1, a0 ; RV32-NEXT: xor a1, a1, a3 ; RV32-NEXT: not a1, a1 @@ -1647,10 +1647,10 @@ ; ; RV32ZBA-LABEL: saddo.not.i64: ; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a4, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 ; RV32ZBA-NEXT: sltu a0, a2, a0 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: add a0, a4, a0 ; RV32ZBA-NEXT: xor a0, a1, a0 ; RV32ZBA-NEXT: xor a1, a1, a3 ; RV32ZBA-NEXT: not a1, a1 @@ -1755,10 +1755,10 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: uaddo.select.i64: ; RV32: # %bb.0: # %entry +; RV32-NEXT: add a5, a1, a3 ; RV32-NEXT: add a4, a0, a2 ; RV32-NEXT: sltu a4, a4, a0 -; RV32-NEXT: add a5, a3, a4 -; RV32-NEXT: add a5, a1, a5 +; RV32-NEXT: add a5, a5, a4 ; RV32-NEXT: bne a5, a1, .LBB34_3 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: beqz a4, .LBB34_4 @@ -1783,10 +1783,10 @@ ; ; RV32ZBA-LABEL: uaddo.select.i64: ; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a5, a1, a3 ; RV32ZBA-NEXT: add a4, a0, a2 ; RV32ZBA-NEXT: sltu a4, a4, a0 -; RV32ZBA-NEXT: add a5, a3, a4 -; RV32ZBA-NEXT: add a5, a1, a5 +; RV32ZBA-NEXT: add a5, a5, a4 ; RV32ZBA-NEXT: bne a5, a1, .LBB34_3 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: beqz a4, .LBB34_4 @@ -1818,10 +1818,10 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: uaddo.not.i64: ; RV32: # %bb.0: # %entry +; RV32-NEXT: add a3, a1, a3 ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: add a2, a3, a0 -; RV32-NEXT: add a2, a1, a2 ; RV32-NEXT: beq a2, a1, .LBB35_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a2, a1 @@ -1838,10 +1838,10 @@ ; ; RV32ZBA-LABEL: uaddo.not.i64: ; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a3, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 ; RV32ZBA-NEXT: sltu a0, a2, a0 ; RV32ZBA-NEXT: add a2, a3, a0 -; RV32ZBA-NEXT: add a2, a1, a2 ; RV32ZBA-NEXT: beq a2, a1, .LBB35_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a2, a1 @@ -1956,11 +1956,11 @@ ; RV32-LABEL: ssubo.select.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a4, a0, a2 -; RV32-NEXT: add a4, a3, a4 -; RV32-NEXT: sub a4, a1, a4 -; RV32-NEXT: xor a4, a1, a4 -; RV32-NEXT: xor a5, a1, a3 -; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: sub a5, a1, a3 +; RV32-NEXT: sub a5, a5, a4 +; RV32-NEXT: xor a5, a1, a5 +; RV32-NEXT: xor a4, a1, a3 +; RV32-NEXT: and a4, a4, a5 ; RV32-NEXT: bltz a4, .LBB38_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a2 @@ -1982,11 +1982,11 @@ ; RV32ZBA-LABEL: ssubo.select.i64: ; RV32ZBA: # %bb.0: # %entry ; 
RV32ZBA-NEXT: sltu a4, a0, a2 -; RV32ZBA-NEXT: add a4, a3, a4 -; RV32ZBA-NEXT: sub a4, a1, a4 -; RV32ZBA-NEXT: xor a4, a1, a4 -; RV32ZBA-NEXT: xor a5, a1, a3 -; RV32ZBA-NEXT: and a4, a5, a4 +; RV32ZBA-NEXT: sub a5, a1, a3 +; RV32ZBA-NEXT: sub a5, a5, a4 +; RV32ZBA-NEXT: xor a5, a1, a5 +; RV32ZBA-NEXT: xor a4, a1, a3 +; RV32ZBA-NEXT: and a4, a4, a5 ; RV32ZBA-NEXT: bltz a4, .LBB38_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a2 @@ -2015,12 +2015,12 @@ ; RV32-LABEL: ssub.not.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a0, a0, a2 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: sub a0, a1, a0 -; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sub a2, a1, a3 +; RV32-NEXT: sub a2, a2, a0 +; RV32-NEXT: xor a2, a1, a2 ; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: slti a0, a0, 0 +; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: slti a0, a1, 0 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret ; @@ -2036,12 +2036,12 @@ ; RV32ZBA-LABEL: ssub.not.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sltu a0, a0, a2 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: sub a0, a1, a0 -; RV32ZBA-NEXT: xor a0, a1, a0 +; RV32ZBA-NEXT: sub a2, a1, a3 +; RV32ZBA-NEXT: sub a2, a2, a0 +; RV32ZBA-NEXT: xor a2, a1, a2 ; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: and a0, a1, a0 -; RV32ZBA-NEXT: slti a0, a0, 0 +; RV32ZBA-NEXT: and a1, a1, a2 +; RV32ZBA-NEXT: slti a0, a1, 0 ; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret ; @@ -2142,8 +2142,8 @@ ; RV32-LABEL: usubo.select.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a4, a0, a2 -; RV32-NEXT: add a4, a3, a4 -; RV32-NEXT: sub a4, a1, a4 +; RV32-NEXT: sub a5, a1, a3 +; RV32-NEXT: sub a4, a5, a4 ; RV32-NEXT: beq a4, a1, .LBB42_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a4, a1, a4 @@ -2171,8 +2171,8 @@ ; RV32ZBA-LABEL: usubo.select.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sltu a4, a0, a2 -; RV32ZBA-NEXT: add a4, a3, a4 -; RV32ZBA-NEXT: sub a4, a1, a4 +; RV32ZBA-NEXT: sub a5, a1, a3 +; RV32ZBA-NEXT: sub a4, a5, a4 ; RV32ZBA-NEXT: beq a4, a1, .LBB42_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a4, a1, a4 @@ -2207,8 +2207,8 @@ ; RV32-LABEL: usubo.not.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a4, a0, a2 -; RV32-NEXT: add a3, a3, a4 ; RV32-NEXT: sub a3, a1, a3 +; RV32-NEXT: sub a3, a3, a4 ; RV32-NEXT: beq a3, a1, .LBB43_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a1, a3 @@ -2230,8 +2230,8 @@ ; RV32ZBA-LABEL: usubo.not.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sltu a4, a0, a2 -; RV32ZBA-NEXT: add a3, a3, a4 ; RV32ZBA-NEXT: sub a3, a1, a3 +; RV32ZBA-NEXT: sub a3, a3, a4 ; RV32ZBA-NEXT: beq a3, a1, .LBB43_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a1, a3 @@ -2377,21 +2377,21 @@ ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 +; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 -; RV32-NEXT: mulhu a7, a2, t1 -; RV32-NEXT: mul t0, a3, t1 -; RV32-NEXT: add t0, t2, t0 -; RV32-NEXT: add a7, a7, t0 -; RV32-NEXT: mul t0, t3, a1 -; RV32-NEXT: mulhu t1, t3, a0 -; RV32-NEXT: add t0, t0, t4 +; RV32-NEXT: mulhu a6, a2, t1 +; RV32-NEXT: add a6, a6, t2 +; RV32-NEXT: mul a7, a3, t1 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: mul a7, t3, a1 +; RV32-NEXT: mulhu t0, t3, a0 ; RV32-NEXT: add a7, t0, a7 -; RV32-NEXT: sltu t0, t5, t4 -; RV32-NEXT: add a7, a7, t0 -; RV32-NEXT: add a7, t1, a7 -; RV32-NEXT: add a5, a5, a7 +; RV32-NEXT: add a7, a7, t4 +; RV32-NEXT: add a6, a7, a6 +; RV32-NEXT: sltu a7, t5, t4 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: add a5, a5, a6 ; 
RV32-NEXT: add a5, a5, s0 -; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: srai a4, a4, 31 ; RV32-NEXT: xor a5, a5, a4 ; RV32-NEXT: xor a4, t6, a4 @@ -2446,21 +2446,21 @@ ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 +; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 -; RV32ZBA-NEXT: mulhu a7, a2, t1 -; RV32ZBA-NEXT: mul t0, a3, t1 -; RV32ZBA-NEXT: add t0, t2, t0 -; RV32ZBA-NEXT: add a7, a7, t0 -; RV32ZBA-NEXT: mul t0, t3, a1 -; RV32ZBA-NEXT: mulhu t1, t3, a0 -; RV32ZBA-NEXT: add t0, t0, t4 +; RV32ZBA-NEXT: mulhu a6, a2, t1 +; RV32ZBA-NEXT: add a6, a6, t2 +; RV32ZBA-NEXT: mul a7, a3, t1 +; RV32ZBA-NEXT: add a6, a6, a7 +; RV32ZBA-NEXT: mul a7, t3, a1 +; RV32ZBA-NEXT: mulhu t0, t3, a0 ; RV32ZBA-NEXT: add a7, t0, a7 -; RV32ZBA-NEXT: sltu t0, t5, t4 -; RV32ZBA-NEXT: add a7, a7, t0 -; RV32ZBA-NEXT: add a7, t1, a7 -; RV32ZBA-NEXT: add a5, a5, a7 +; RV32ZBA-NEXT: add a7, a7, t4 +; RV32ZBA-NEXT: add a6, a7, a6 +; RV32ZBA-NEXT: sltu a7, t5, t4 +; RV32ZBA-NEXT: add a6, a6, a7 +; RV32ZBA-NEXT: add a5, a5, a6 ; RV32ZBA-NEXT: add a5, a5, s0 -; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: srai a4, a4, 31 ; RV32ZBA-NEXT: xor a5, a5, a4 ; RV32ZBA-NEXT: xor a4, t6, a4 @@ -2522,21 +2522,21 @@ ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 +; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a2, a2, t1 +; RV32-NEXT: add a2, a2, t2 ; RV32-NEXT: mul a3, a3, t1 -; RV32-NEXT: add a3, t2, a3 ; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: mul a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 -; RV32-NEXT: add a1, a1, t4 -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: sltu a2, t5, t4 -; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a0, a0, t4 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: sltu a1, t5, t4 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a0, a5, a0 ; RV32-NEXT: add a0, a0, s0 -; RV32-NEXT: add a0, a6, a0 ; RV32-NEXT: srai a4, a4, 31 ; RV32-NEXT: xor a0, a0, a4 ; RV32-NEXT: xor a1, t6, a4 @@ -2585,21 +2585,21 @@ ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 +; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a2, a2, t1 +; RV32ZBA-NEXT: add a2, a2, t2 ; RV32ZBA-NEXT: mul a3, a3, t1 -; RV32ZBA-NEXT: add a3, t2, a3 ; RV32ZBA-NEXT: add a2, a2, a3 ; RV32ZBA-NEXT: mul a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 -; RV32ZBA-NEXT: add a1, a1, t4 -; RV32ZBA-NEXT: add a1, a1, a2 -; RV32ZBA-NEXT: sltu a2, t5, t4 -; RV32ZBA-NEXT: add a1, a1, a2 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: add a0, a0, t4 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: sltu a1, t5, t4 ; RV32ZBA-NEXT: add a0, a0, a1 ; RV32ZBA-NEXT: add a0, a5, a0 ; RV32ZBA-NEXT: add a0, a0, s0 -; RV32ZBA-NEXT: add a0, a6, a0 ; RV32ZBA-NEXT: srai a4, a4, 31 ; RV32ZBA-NEXT: xor a0, a0, a4 ; RV32ZBA-NEXT: xor a1, t6, a4 @@ -2715,19 +2715,19 @@ ; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 -; RV32-NEXT: mulhu a6, a0, a2 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: add a4, a4, a5 -; RV32-NEXT: sltu a4, a4, a6 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: mulhu a5, a0, a2 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: sltu a4, a4, a5 ; RV32-NEXT: snez a5, a3 ; RV32-NEXT: snez a6, a1 ; RV32-NEXT: and a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a2 ; RV32-NEXT: snez a6, a6 -; RV32-NEXT: mulhu a7, a3, a0 -; RV32-NEXT: snez a7, a7 -; RV32-NEXT: or a6, a6, a7 -; RV32-NEXT: or a4, a6, a4 +; RV32-NEXT: or a5, a5, a6 +; RV32-NEXT: mulhu a6, 
a3, a0 +; RV32-NEXT: snez a6, a6 +; RV32-NEXT: or a5, a5, a6 ; RV32-NEXT: or a4, a5, a4 ; RV32-NEXT: bnez a4, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry @@ -2749,19 +2749,19 @@ ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 -; RV32ZBA-NEXT: mulhu a6, a0, a2 -; RV32ZBA-NEXT: add a4, a6, a4 -; RV32ZBA-NEXT: add a4, a4, a5 -; RV32ZBA-NEXT: sltu a4, a4, a6 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: sltu a4, a4, a5 ; RV32ZBA-NEXT: snez a5, a3 ; RV32ZBA-NEXT: snez a6, a1 ; RV32ZBA-NEXT: and a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a2 ; RV32ZBA-NEXT: snez a6, a6 -; RV32ZBA-NEXT: mulhu a7, a3, a0 -; RV32ZBA-NEXT: snez a7, a7 -; RV32ZBA-NEXT: or a6, a6, a7 -; RV32ZBA-NEXT: or a4, a6, a4 +; RV32ZBA-NEXT: or a5, a5, a6 +; RV32ZBA-NEXT: mulhu a6, a3, a0 +; RV32ZBA-NEXT: snez a6, a6 +; RV32ZBA-NEXT: or a5, a5, a6 ; RV32ZBA-NEXT: or a4, a5, a4 ; RV32ZBA-NEXT: bnez a4, .LBB50_2 ; RV32ZBA-NEXT: # %bb.1: # %entry @@ -2790,20 +2790,20 @@ ; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 -; RV32-NEXT: mulhu a6, a0, a2 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: add a4, a4, a5 -; RV32-NEXT: sltu a4, a4, a6 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: mulhu a5, a0, a2 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: sltu a4, a4, a5 ; RV32-NEXT: snez a5, a3 ; RV32-NEXT: snez a6, a1 ; RV32-NEXT: and a5, a6, a5 ; RV32-NEXT: mulhu a1, a1, a2 ; RV32-NEXT: snez a1, a1 +; RV32-NEXT: or a1, a5, a1 ; RV32-NEXT: mulhu a0, a3, a0 ; RV32-NEXT: snez a0, a0 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: or a0, a0, a4 -; RV32-NEXT: or a0, a5, a0 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret ; @@ -2817,20 +2817,20 @@ ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 -; RV32ZBA-NEXT: mulhu a6, a0, a2 -; RV32ZBA-NEXT: add a4, a6, a4 -; RV32ZBA-NEXT: add a4, a4, a5 -; RV32ZBA-NEXT: sltu a4, a4, a6 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: sltu a4, a4, a5 ; RV32ZBA-NEXT: snez a5, a3 ; RV32ZBA-NEXT: snez a6, a1 ; RV32ZBA-NEXT: and a5, a6, a5 ; RV32ZBA-NEXT: mulhu a1, a1, a2 ; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: or a1, a5, a1 ; RV32ZBA-NEXT: mulhu a0, a3, a0 ; RV32ZBA-NEXT: snez a0, a0 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: or a0, a0, a4 -; RV32ZBA-NEXT: or a0, a5, a0 ; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret ; @@ -2916,10 +2916,10 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: saddo.br.i64: ; RV32: # %bb.0: # %entry +; RV32-NEXT: add a4, a1, a3 ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: sltu a0, a2, a0 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: add a0, a4, a0 ; RV32-NEXT: xor a0, a1, a0 ; RV32-NEXT: xor a1, a1, a3 ; RV32-NEXT: not a1, a1 @@ -2947,10 +2947,10 @@ ; ; RV32ZBA-LABEL: saddo.br.i64: ; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a4, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 ; RV32ZBA-NEXT: sltu a0, a2, a0 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: add a0, a4, a0 ; RV32ZBA-NEXT: xor a0, a1, a0 ; RV32ZBA-NEXT: xor a1, a1, a3 ; RV32ZBA-NEXT: not a1, a1 @@ -3050,10 +3050,10 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) { ; RV32-LABEL: uaddo.br.i64: ; RV32: # %bb.0: # %entry +; RV32-NEXT: add a3, a1, a3 ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: add a2, a3, a0 -; RV32-NEXT: add a2, a1, a2 ; RV32-NEXT: beq a2, a1, .LBB55_2 ; RV32-NEXT: # %bb.1: # 
%entry ; RV32-NEXT: sltu a0, a2, a1 @@ -3079,10 +3079,10 @@ ; ; RV32ZBA-LABEL: uaddo.br.i64: ; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a3, a1, a3 ; RV32ZBA-NEXT: add a2, a0, a2 ; RV32ZBA-NEXT: sltu a0, a2, a0 ; RV32ZBA-NEXT: add a2, a3, a0 -; RV32ZBA-NEXT: add a2, a1, a2 ; RV32ZBA-NEXT: beq a2, a1, .LBB55_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a2, a1 @@ -3185,12 +3185,12 @@ ; RV32-LABEL: ssubo.br.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a0, a0, a2 -; RV32-NEXT: add a0, a3, a0 -; RV32-NEXT: sub a0, a1, a0 -; RV32-NEXT: xor a0, a1, a0 +; RV32-NEXT: sub a2, a1, a3 +; RV32-NEXT: sub a2, a2, a0 +; RV32-NEXT: xor a2, a1, a2 ; RV32-NEXT: xor a1, a1, a3 -; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: bgez a0, .LBB57_2 +; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: bgez a1, .LBB57_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 ; RV32-NEXT: ret @@ -3214,12 +3214,12 @@ ; RV32ZBA-LABEL: ssubo.br.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sltu a0, a0, a2 -; RV32ZBA-NEXT: add a0, a3, a0 -; RV32ZBA-NEXT: sub a0, a1, a0 -; RV32ZBA-NEXT: xor a0, a1, a0 +; RV32ZBA-NEXT: sub a2, a1, a3 +; RV32ZBA-NEXT: sub a2, a2, a0 +; RV32ZBA-NEXT: xor a2, a1, a2 ; RV32ZBA-NEXT: xor a1, a1, a3 -; RV32ZBA-NEXT: and a0, a1, a0 -; RV32ZBA-NEXT: bgez a0, .LBB57_2 +; RV32ZBA-NEXT: and a1, a1, a2 +; RV32ZBA-NEXT: bgez a1, .LBB57_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 ; RV32ZBA-NEXT: ret @@ -3313,8 +3313,8 @@ ; RV32-LABEL: usubo.br.i64: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sltu a4, a0, a2 -; RV32-NEXT: add a3, a3, a4 ; RV32-NEXT: sub a3, a1, a3 +; RV32-NEXT: sub a3, a3, a4 ; RV32-NEXT: beq a3, a1, .LBB59_3 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a1, a3 @@ -3344,8 +3344,8 @@ ; RV32ZBA-LABEL: usubo.br.i64: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sltu a4, a0, a2 -; RV32ZBA-NEXT: add a3, a3, a4 ; RV32ZBA-NEXT: sub a3, a1, a3 +; RV32ZBA-NEXT: sub a3, a3, a4 ; RV32ZBA-NEXT: beq a3, a1, .LBB59_3 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a1, a3 @@ -3478,21 +3478,21 @@ ; RV32-NEXT: sltu a7, t0, a7 ; RV32-NEXT: sltu a5, a6, a5 ; RV32-NEXT: mulhu a6, a1, a3 +; RV32-NEXT: add a5, a6, a5 ; RV32-NEXT: add a5, a5, a7 ; RV32-NEXT: mulhu a2, a2, t1 +; RV32-NEXT: add a2, a2, t2 ; RV32-NEXT: mul a3, a3, t1 -; RV32-NEXT: add a3, t2, a3 ; RV32-NEXT: add a2, a2, a3 ; RV32-NEXT: mul a1, t3, a1 ; RV32-NEXT: mulhu a0, t3, a0 -; RV32-NEXT: add a1, a1, t4 -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: sltu a2, t5, t4 -; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add a0, a0, t4 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: sltu a1, t5, t4 ; RV32-NEXT: add a0, a0, a1 ; RV32-NEXT: add a0, a5, a0 ; RV32-NEXT: add a0, a0, s0 -; RV32-NEXT: add a0, a6, a0 ; RV32-NEXT: srai a4, a4, 31 ; RV32-NEXT: xor a0, a0, a4 ; RV32-NEXT: xor a1, t6, a4 @@ -3551,21 +3551,21 @@ ; RV32ZBA-NEXT: sltu a7, t0, a7 ; RV32ZBA-NEXT: sltu a5, a6, a5 ; RV32ZBA-NEXT: mulhu a6, a1, a3 +; RV32ZBA-NEXT: add a5, a6, a5 ; RV32ZBA-NEXT: add a5, a5, a7 ; RV32ZBA-NEXT: mulhu a2, a2, t1 +; RV32ZBA-NEXT: add a2, a2, t2 ; RV32ZBA-NEXT: mul a3, a3, t1 -; RV32ZBA-NEXT: add a3, t2, a3 ; RV32ZBA-NEXT: add a2, a2, a3 ; RV32ZBA-NEXT: mul a1, t3, a1 ; RV32ZBA-NEXT: mulhu a0, t3, a0 -; RV32ZBA-NEXT: add a1, a1, t4 -; RV32ZBA-NEXT: add a1, a1, a2 -; RV32ZBA-NEXT: sltu a2, t5, t4 -; RV32ZBA-NEXT: add a1, a1, a2 +; RV32ZBA-NEXT: add a0, a0, a1 +; RV32ZBA-NEXT: add a0, a0, t4 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: sltu a1, t5, t4 ; RV32ZBA-NEXT: add a0, a0, a1 ; RV32ZBA-NEXT: 
add a0, a5, a0 ; RV32ZBA-NEXT: add a0, a0, s0 -; RV32ZBA-NEXT: add a0, a6, a0 ; RV32ZBA-NEXT: srai a4, a4, 31 ; RV32ZBA-NEXT: xor a0, a0, a4 ; RV32ZBA-NEXT: xor a1, t6, a4 @@ -3633,16 +3633,16 @@ ; RV32-NEXT: sltu t1, t1, t6 ; RV32-NEXT: sltu a4, a6, a4 ; RV32-NEXT: mulhu a6, a1, a7 +; RV32-NEXT: add a4, a6, a4 ; RV32-NEXT: add a4, a4, t1 ; RV32-NEXT: sltu a5, t3, a5 ; RV32-NEXT: mulh a2, t2, a2 ; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: sub a0, a0, a2 -; RV32-NEXT: sub a0, a0, a5 ; RV32-NEXT: sub a0, t0, a0 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a0, a0, a5 ; RV32-NEXT: add a0, a4, a0 ; RV32-NEXT: add a0, a0, t5 -; RV32-NEXT: add a0, a6, a0 ; RV32-NEXT: srai a3, a3, 31 ; RV32-NEXT: xor a0, a0, a3 ; RV32-NEXT: xor a1, t4, a3 @@ -3695,16 +3695,16 @@ ; RV32ZBA-NEXT: sltu t1, t1, t6 ; RV32ZBA-NEXT: sltu a4, a6, a4 ; RV32ZBA-NEXT: mulhu a6, a1, a7 +; RV32ZBA-NEXT: add a4, a6, a4 ; RV32ZBA-NEXT: add a4, a4, t1 ; RV32ZBA-NEXT: sltu a5, t3, a5 ; RV32ZBA-NEXT: mulh a2, t2, a2 ; RV32ZBA-NEXT: add a0, a0, a1 -; RV32ZBA-NEXT: sub a0, a0, a2 -; RV32ZBA-NEXT: sub a0, a0, a5 ; RV32ZBA-NEXT: sub a0, t0, a0 +; RV32ZBA-NEXT: add a0, a0, a2 +; RV32ZBA-NEXT: add a0, a0, a5 ; RV32ZBA-NEXT: add a0, a4, a0 ; RV32ZBA-NEXT: add a0, a0, t5 -; RV32ZBA-NEXT: add a0, a6, a0 ; RV32ZBA-NEXT: srai a3, a3, 31 ; RV32ZBA-NEXT: xor a0, a0, a3 ; RV32ZBA-NEXT: xor a1, t4, a3 @@ -3811,20 +3811,20 @@ ; RV32: # %bb.0: # %entry ; RV32-NEXT: mul a4, a3, a0 ; RV32-NEXT: mul a5, a1, a2 -; RV32-NEXT: mulhu a6, a0, a2 -; RV32-NEXT: add a4, a6, a4 -; RV32-NEXT: add a4, a4, a5 -; RV32-NEXT: sltu a4, a4, a6 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: mulhu a5, a0, a2 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: sltu a4, a4, a5 ; RV32-NEXT: snez a5, a3 ; RV32-NEXT: snez a6, a1 ; RV32-NEXT: and a5, a6, a5 ; RV32-NEXT: mulhu a1, a1, a2 ; RV32-NEXT: snez a1, a1 +; RV32-NEXT: or a1, a5, a1 ; RV32-NEXT: mulhu a0, a3, a0 ; RV32-NEXT: snez a0, a0 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: or a0, a0, a4 -; RV32-NEXT: or a0, a5, a0 ; RV32-NEXT: beqz a0, .LBB64_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: li a0, 0 @@ -3848,20 +3848,20 @@ ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mul a4, a3, a0 ; RV32ZBA-NEXT: mul a5, a1, a2 -; RV32ZBA-NEXT: mulhu a6, a0, a2 -; RV32ZBA-NEXT: add a4, a6, a4 -; RV32ZBA-NEXT: add a4, a4, a5 -; RV32ZBA-NEXT: sltu a4, a4, a6 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: sltu a4, a4, a5 ; RV32ZBA-NEXT: snez a5, a3 ; RV32ZBA-NEXT: snez a6, a1 ; RV32ZBA-NEXT: and a5, a6, a5 ; RV32ZBA-NEXT: mulhu a1, a1, a2 ; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: or a1, a5, a1 ; RV32ZBA-NEXT: mulhu a0, a3, a0 ; RV32ZBA-NEXT: snez a0, a0 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: or a0, a0, a4 -; RV32ZBA-NEXT: or a0, a5, a0 ; RV32ZBA-NEXT: beqz a0, .LBB64_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: li a0, 0 @@ -3898,8 +3898,8 @@ ; RV32: # %bb.0: # %entry ; RV32-NEXT: add a2, a0, a0 ; RV32-NEXT: sltu a0, a2, a0 -; RV32-NEXT: add a2, a1, a0 -; RV32-NEXT: add a2, a1, a2 +; RV32-NEXT: add a2, a1, a1 +; RV32-NEXT: add a2, a2, a0 ; RV32-NEXT: beq a2, a1, .LBB65_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a2, a1 @@ -3927,8 +3927,8 @@ ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: add a2, a0, a0 ; RV32ZBA-NEXT: sltu a0, a2, a0 -; RV32ZBA-NEXT: add a2, a1, a0 -; RV32ZBA-NEXT: add a2, a1, a2 +; RV32ZBA-NEXT: add a2, a1, a1 +; RV32ZBA-NEXT: add a2, a2, a0 ; RV32ZBA-NEXT: beq a2, a1, .LBB65_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, 
a2, a1 diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll --- a/llvm/test/CodeGen/X86/abdu-vector-128.ll +++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll @@ -1498,9 +1498,10 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubusw %xmm1, %xmm2 ; SSE2-NEXT: psubusw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: psubw %xmm0, %xmm2 ; SSE2-NEXT: paddw %xmm1, %xmm2 -; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: movdqa %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: abd_minmax_v8i16: diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -68,11 +68,11 @@ ; ; X64-LABEL: test_i32_add_add_idx0: ; X64: # %bb.0: -; X64-NEXT: # kill: def $edx killed $edx def $rdx +; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal (%rdi,%rsi), %eax ; X64-NEXT: andl $1, %edx -; X64-NEXT: leal (%rdx,%rdi), %eax -; X64-NEXT: addl %esi, %eax +; X64-NEXT: addl %edx, %eax ; X64-NEXT: retq %add = add i32 %y, %x %mask = and i32 %z, 1 diff --git a/llvm/test/CodeGen/X86/alias-static-alloca.ll b/llvm/test/CodeGen/X86/alias-static-alloca.ll --- a/llvm/test/CodeGen/X86/alias-static-alloca.ll +++ b/llvm/test/CodeGen/X86/alias-static-alloca.ll @@ -7,15 +7,17 @@ define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $ecx killed $ecx def $rcx ; CHECK-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: # kill: def $edi killed $edi def $rdi ; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: leal (%rsi,%rdx), %eax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: addl %edi, %esi +; CHECK-NEXT: leal (%rdx,%rcx), %eax +; CHECK-NEXT: addl %esi, %eax ; CHECK-NEXT: retq entry: %a0 = alloca i32 diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll --- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll +++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll @@ -59,13 +59,13 @@ define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly { ; CHECK-LABEL: DAGCombineB: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %t1 = add <8 x i32> %v1, %v2 %t2 = add <8 x i32> %t1, %v1 diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll --- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -316,7 +316,7 @@ ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu64 8192(%rdi,%rax), %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm0, %zmm1, 
%zmm0 ; CHECK-NEXT: vmovdqu64 %zmm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB9_1 @@ -351,7 +351,7 @@ ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %ymm0 ; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1 -; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vmovdqu %ymm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB10_1 @@ -386,7 +386,7 @@ ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmovdqu 8192(%rdi,%rax), %xmm0 ; CHECK-NEXT: vpaddq %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovdqu %xmm0, 8192(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB11_1 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-x86_64.ll @@ -117,10 +117,10 @@ define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: ; CHECK: ## %bb.0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2usi %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: retq @@ -136,10 +136,10 @@ define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si64: ; CHECK: ## %bb.0: -; CHECK-NEXT: vcvtsd2si %xmm0, %rcx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2si %xmm0, %rax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: retq @@ -155,10 +155,10 @@ define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi64: ; CHECK: ## %bb.0: -; CHECK-NEXT: vcvtss2usi %xmm0, %rcx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2usi %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: retq @@ -174,10 +174,10 @@ define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si64: ; CHECK: ## %bb.0: -; CHECK-NEXT: vcvtss2si %xmm0, %rcx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2si %xmm0, %rax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -934,10 +934,10 @@ define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2usi %xmm0, %eax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: addl %ecx, %eax 
; CHECK-NEXT: ret{{[l|q]}} @@ -953,10 +953,10 @@ define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si32: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtsd2si %xmm0, %ecx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtsd2si %xmm0, %eax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -972,10 +972,10 @@ define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtss2usi %xmm0, %ecx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2usi %xmm0, %eax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -991,10 +991,10 @@ define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si32: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtss2si %xmm0, %ecx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %edx +; CHECK-NEXT: vcvtss2si %xmm0, %eax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax -; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: ret{{[l|q]}} @@ -3220,9 +3220,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovqb %zmm0, %xmm2 ; X64-NEXT: vpmovqb %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3232,9 +3232,9 @@ ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovqb %zmm0, %xmm2 ; X86-NEXT: vpmovqb %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovqb %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -3278,9 +3278,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovsqb %zmm0, %xmm2 ; X64-NEXT: vpmovsqb %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3290,9 +3290,9 @@ ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovsqb %zmm0, %xmm2 ; X86-NEXT: vpmovsqb %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovsqb %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -3336,9 +3336,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovusqb %zmm0, %xmm2 ; X64-NEXT: vpmovusqb %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3348,9 +3348,9 @@ ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovusqb %zmm0, %xmm2 ; X86-NEXT: vpmovusqb %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovusqb %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddb 
%xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -3394,9 +3394,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovqw %zmm0, %xmm2 ; X64-NEXT: vpmovqw %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3406,9 +3406,9 @@ ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovqw %zmm0, %xmm2 ; X86-NEXT: vpmovqw %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovqw %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -3452,9 +3452,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovsqw %zmm0, %xmm2 ; X64-NEXT: vpmovsqw %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3464,9 +3464,9 @@ ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovsqw %zmm0, %xmm2 ; X86-NEXT: vpmovsqw %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovsqw %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -3510,9 +3510,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovusqw %zmm0, %xmm2 ; X64-NEXT: vpmovusqw %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3522,9 +3522,9 @@ ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovusqw %zmm0, %xmm2 ; X86-NEXT: vpmovusqw %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovusqw %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -3786,9 +3786,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovdb %zmm0, %xmm2 ; X64-NEXT: vpmovdb %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3797,9 +3797,9 @@ ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpmovdb %zmm0, %xmm2 ; X86-NEXT: vpmovdb %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) @@ -3842,9 +3842,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovsdb %zmm0, %xmm2 ; X64-NEXT: vpmovsdb %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3853,9 +3853,9 @@ ; X86-NEXT: kmovw {{[0-9]+}}(%esp), 
%k1 ; X86-NEXT: vpmovsdb %zmm0, %xmm2 ; X86-NEXT: vpmovsdb %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovsdb %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) @@ -3898,9 +3898,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovusdb %zmm0, %xmm2 ; X64-NEXT: vpmovusdb %zmm0, %xmm1 {%k1} +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X64-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z} ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -3909,9 +3909,9 @@ ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpmovusdb %zmm0, %xmm2 ; X86-NEXT: vpmovusdb %zmm0, %xmm1 {%k1} +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ; X86-NEXT: vpmovusdb %zmm0, %xmm0 {%k1} {z} ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) @@ -3954,9 +3954,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovdw %zmm0, %ymm2 ; X64-NEXT: vpmovdw %zmm0, %ymm1 {%k1} +; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X64-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z} ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmov_dw_512: @@ -3964,9 +3964,9 @@ ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpmovdw %zmm0, %ymm2 ; X86-NEXT: vpmovdw %zmm0, %ymm1 {%k1} +; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X86-NEXT: vpmovdw %zmm0, %ymm0 {%k1} {z} ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; X86-NEXT: retl %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) @@ -4008,9 +4008,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovsdw %zmm0, %ymm2 ; X64-NEXT: vpmovsdw %zmm0, %ymm1 {%k1} +; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X64-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z} ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmovs_dw_512: @@ -4018,9 +4018,9 @@ ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpmovsdw %zmm0, %ymm2 ; X86-NEXT: vpmovsdw %zmm0, %ymm1 {%k1} +; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X86-NEXT: vpmovsdw %zmm0, %ymm0 {%k1} {z} ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; X86-NEXT: retl %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) @@ -4062,9 +4062,9 @@ ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovusdw %zmm0, %ymm2 ; X64-NEXT: vpmovusdw %zmm0, %ymm1 {%k1} +; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X64-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z} ; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmovus_dw_512: @@ -4072,9 +4072,9 @@ ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpmovusdw %zmm0, %ymm2 ; X86-NEXT: vpmovusdw %zmm0, %ymm1 {%k1} +; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X86-NEXT: vpmovusdw %zmm0, %ymm0 {%k1} {z} ; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ; 
X86-NEXT: retl %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) @@ -4548,9 +4548,9 @@ ; X64-NEXT: kmovw %k0, %esi ; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: orl %ecx, %edx ; X64-NEXT: orl %esi, %eax ; X64-NEXT: orl %edx, %eax -; X64-NEXT: orl %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -4569,9 +4569,9 @@ ; X86-NEXT: kmovw %k0, %esi ; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax +; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %esi, %eax ; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 @@ -4625,9 +4625,9 @@ ; X64-NEXT: kmovw %k0, %esi ; X64-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax +; X64-NEXT: andl %ecx, %edx ; X64-NEXT: andl %esi, %eax ; X64-NEXT: andl %edx, %eax -; X64-NEXT: andl %ecx, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -4646,9 +4646,9 @@ ; X86-NEXT: kmovw %k0, %esi ; X86-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax +; X86-NEXT: andl %ecx, %edx ; X86-NEXT: andl %esi, %eax ; X86-NEXT: andl %edx, %eax -; X86-NEXT: andl %ecx, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1239,11 +1239,11 @@ ; X86-NEXT: kshiftlq $6, %k1, %k1 ; X86-NEXT: kshiftlq $59, %k0, %k0 ; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: movb $1, %al -; X86-NEXT: kmovd %eax, %k2 -; X86-NEXT: kshiftlq $63, %k2, %k2 -; X86-NEXT: kshiftrq $58, %k2, %k2 -; X86-NEXT: korq %k1, %k2, %k1 +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlq $63, %k1, %k1 +; X86-NEXT: kshiftrq $58, %k1, %k1 ; X86-NEXT: korq %k0, %k1, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl @@ -1361,10 +1361,10 @@ ; X86-NEXT: kshiftlq $6, %k1, %k1 ; X86-NEXT: kshiftlq $59, %k0, %k0 ; X86-NEXT: kshiftrq $59, %k0, %k0 -; X86-NEXT: kmovd %eax, %k2 -; X86-NEXT: kshiftlq $63, %k2, %k2 -; X86-NEXT: kshiftrq $58, %k2, %k2 -; X86-NEXT: korq %k1, %k2, %k1 +; X86-NEXT: korq %k1, %k0, %k0 +; X86-NEXT: kmovd %eax, %k1 +; X86-NEXT: kshiftlq $63, %k1, %k1 +; X86-NEXT: kshiftrq $58, %k1, %k1 ; X86-NEXT: korq %k0, %k1, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -35,29 +35,29 @@ ; ; WIN64-LABEL: test_argv64i1: ; WIN64: # %bb.0: -; WIN64-NEXT: addq %rdx, %rcx -; WIN64-NEXT: addq %rdi, %rcx -; WIN64-NEXT: addq %rsi, %rcx -; WIN64-NEXT: addq %r8, %rcx +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: addq %rdx, %rax +; WIN64-NEXT: addq %rdi, %rax +; WIN64-NEXT: leaq (%rsi,%r8), %rcx ; WIN64-NEXT: addq %r9, %rcx -; WIN64-NEXT: addq %r10, %rcx -; WIN64-NEXT: addq %r11, %rcx +; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: leaq (%r10,%r11), %rcx ; WIN64-NEXT: addq %r12, %rcx ; WIN64-NEXT: addq %r14, %rcx -; WIN64-NEXT: addq %r15, %rcx ; WIN64-NEXT: addq %rcx, %rax +; WIN64-NEXT: addq %r15, %rax ; 
WIN64-NEXT: addq {{[0-9]+}}(%rsp), %rax ; WIN64-NEXT: retq ; ; LINUXOSX64-LABEL: test_argv64i1: ; LINUXOSX64: # %bb.0: -; LINUXOSX64-NEXT: addq %rdx, %rcx -; LINUXOSX64-NEXT: addq %rdi, %rcx -; LINUXOSX64-NEXT: addq %rsi, %rcx -; LINUXOSX64-NEXT: addq %r8, %rcx +; LINUXOSX64-NEXT: addq %rcx, %rax +; LINUXOSX64-NEXT: addq %rdx, %rax +; LINUXOSX64-NEXT: addq %rdi, %rax +; LINUXOSX64-NEXT: leaq (%rsi,%r8), %rcx ; LINUXOSX64-NEXT: addq %r9, %rcx -; LINUXOSX64-NEXT: addq %r12, %rcx -; LINUXOSX64-NEXT: addq %r13, %rcx +; LINUXOSX64-NEXT: addq %rcx, %rax +; LINUXOSX64-NEXT: leaq (%r12,%r13), %rcx ; LINUXOSX64-NEXT: addq %r14, %rcx ; LINUXOSX64-NEXT: addq %r15, %rcx ; LINUXOSX64-NEXT: addq %rcx, %rax diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -939,7 +939,7 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %ebp ; X32-NEXT: pushl %ebx -; X32-NEXT: subl $16, %esp +; X32-NEXT: subl $12, %esp ; X32-NEXT: movl %esi, (%esp) # 4-byte Spill ; X32-NEXT: movl %edi, %esi ; X32-NEXT: movl %edx, %ebx @@ -950,37 +950,36 @@ ; X32-NEXT: subl %esi, %ebx ; X32-NEXT: movl %edi, %eax ; X32-NEXT: subl %ecx, %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl %ebp, %ecx ; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl %esi, %edx -; X32-NEXT: subl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %ebx, %edx -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl %esi, %eax +; X32-NEXT: subl {{[0-9]+}}(%esp), %eax +; X32-NEXT: imull %ebx, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: subl %ebp, %ebx -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, %ecx +; X32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: subl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %edx, %ecx +; X32-NEXT: addl %eax, %ecx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: addl {{[0-9]+}}(%esp), %edx -; X32-NEXT: imull %edx, %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: imull %ebp, %edi ; X32-NEXT: addl {{[0-9]+}}(%esp), %esi ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl {{[0-9]+}}(%esp), %eax -; X32-NEXT: imull %ebp, %eax -; X32-NEXT: addl %esi, %eax -; X32-NEXT: addl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: addl {{[0-9]+}}(%esp), %edx +; X32-NEXT: imull %eax, %edx +; X32-NEXT: addl %edx, %edi ; X32-NEXT: addl %ecx, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: addl $16, %esp +; X32-NEXT: addl $12, %esp ; X32-NEXT: popl %ebx ; X32-NEXT: popl %ebp ; X32-NEXT: retl @@ -1014,18 +1013,18 @@ ; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 ; WIN64-NEXT: subl %r12d, %r11d ; WIN64-NEXT: imull %edx, %r11d +; WIN64-NEXT: addl %r9d, %r11d ; WIN64-NEXT: leal (%r14,%r15), %edx -; WIN64-NEXT: # kill: def $r14d killed $r14d killed $r14 -; WIN64-NEXT: subl %r15d, %r14d -; WIN64-NEXT: imull %esi, %r14d 
-; WIN64-NEXT: addl %r11d, %r14d +; WIN64-NEXT: movl %r14d, %r9d +; WIN64-NEXT: subl %r15d, %r9d +; WIN64-NEXT: imull %esi, %r9d +; WIN64-NEXT: addl %r11d, %r9d ; WIN64-NEXT: addl %ecx, %eax ; WIN64-NEXT: imull %r8d, %eax ; WIN64-NEXT: imull %ebx, %r10d +; WIN64-NEXT: addl %r10d, %eax ; WIN64-NEXT: imull %edi, %edx -; WIN64-NEXT: addl %r10d, %edx ; WIN64-NEXT: addl %edx, %eax -; WIN64-NEXT: addl %r14d, %eax ; WIN64-NEXT: addl %r9d, %eax ; WIN64-NEXT: popq %rbx ; WIN64-NEXT: retq @@ -1055,19 +1054,19 @@ ; LINUXOSX64-NEXT: leal (%r13,%r14), %r11d ; LINUXOSX64-NEXT: movl %r13d, %r12d ; LINUXOSX64-NEXT: subl %r14d, %r12d -; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %r14d ; LINUXOSX64-NEXT: imull %edx, %r12d -; LINUXOSX64-NEXT: movl %r15d, %edx -; LINUXOSX64-NEXT: subl %r14d, %edx -; LINUXOSX64-NEXT: imull %esi, %edx -; LINUXOSX64-NEXT: addl %r12d, %edx +; LINUXOSX64-NEXT: movl {{[0-9]+}}(%rsp), %edx +; LINUXOSX64-NEXT: addl %r9d, %r12d +; LINUXOSX64-NEXT: movl %r15d, %r9d +; LINUXOSX64-NEXT: subl %edx, %r9d +; LINUXOSX64-NEXT: imull %esi, %r9d +; LINUXOSX64-NEXT: addl %r12d, %r9d ; LINUXOSX64-NEXT: addl %ecx, %eax ; LINUXOSX64-NEXT: imull %r8d, %eax ; LINUXOSX64-NEXT: imull %r10d, %r11d -; LINUXOSX64-NEXT: addl %r15d, %r14d -; LINUXOSX64-NEXT: imull %edi, %r14d -; LINUXOSX64-NEXT: addl %r11d, %r14d -; LINUXOSX64-NEXT: addl %r14d, %eax +; LINUXOSX64-NEXT: addl %r11d, %eax +; LINUXOSX64-NEXT: addl %r15d, %edx +; LINUXOSX64-NEXT: imull %edi, %edx ; LINUXOSX64-NEXT: addl %edx, %eax ; LINUXOSX64-NEXT: addl %r9d, %eax ; LINUXOSX64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1905,19 +1905,19 @@ ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0] ; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02] -; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04] -; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] -; X64-NEXT: addq %rdx, %rcx # encoding: [0x48,0x01,0xd1] -; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05] ; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] ; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] -; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] +; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] +; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] -; X64-NEXT: leaq -1(%rax,%rdx), %rax # encoding: [0x48,0x8d,0x44,0x10,0xff] +; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] +; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1] +; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff] ; X64-NEXT: vzeroupper # 
encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) @@ -1987,23 +1987,23 @@ ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] -; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] -; X64-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0] +; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02] -; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] -; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] -; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] -; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] +; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] ; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] +; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] +; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] +; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] -; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] -; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8] ; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8] +; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8] +; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) @@ -2078,19 +2078,19 @@ ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01] ; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02] -; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] +; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04] -; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] -; X64-NEXT: addq %rdx, %rcx # encoding: [0x48,0x01,0xd1] -; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05] ; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] ; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] -; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: 
[0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06] +; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] +; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] -; X64-NEXT: leaq -1(%rax,%rdx), %rax # encoding: [0x48,0x8d,0x44,0x10,0xff] +; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] +; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06] +; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1) @@ -2160,23 +2160,23 @@ ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1] -; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] -; X64-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] +; X64-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01] +; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] +; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1] ; X64-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02] -; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] -; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] -; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] -; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] -; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] +; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0] ; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2] +; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca] +; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] +; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06] ; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0] -; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] -; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8] ; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8] +; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8] +; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask) @@ -2206,19 +2206,19 @@ ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0] ; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X86-NEXT: vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %ecx, %edx # encoding: 
[0x01,0xca] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %edx, %ecx # encoding: [0x01,0xd1] -; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] ; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] -; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] +; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: leal -1(%eax,%edx), %eax # encoding: [0x8d,0x44,0x10,0xff] +; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2228,19 +2228,19 @@ ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02] -; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04] -; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X64-NEXT: addl %edx, %ecx # encoding: [0x01,0xd1] -; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05] ; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] ; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] -; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] +; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X64-NEXT: leal -1(%rax,%rdx), %eax # encoding: [0x8d,0x44,0x10,0xff] +; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1] +; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) @@ -2268,23 +2268,23 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] ; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] 
+; X86-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X86-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] -; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] -; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05] +; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] +; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: popl %esi # encoding: [0x5e] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2293,23 +2293,23 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1] -; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X64-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0] +; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02] -; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] -; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05] +; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] +; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05] +; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] +; 
X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] +; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) @@ -2339,19 +2339,19 @@ ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01] ; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X86-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04] -; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X86-NEXT: addl %edx, %ecx # encoding: [0x01,0xd1] -; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05] ; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] ; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] -; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06] +; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: leal -1(%eax,%edx), %eax # encoding: [0x8d,0x44,0x10,0xff] +; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06] +; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X86-NEXT: leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2361,19 +2361,19 @@ ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02] -; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] ; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04] -; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X64-NEXT: addl %edx, %ecx # encoding: [0x01,0xd1] -; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05] ; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] ; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] -; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06] +; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X64-NEXT: leal -1(%rax,%rdx), %eax # encoding: [0x8d,0x44,0x10,0xff] +; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: 
[0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06] +; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1) @@ -2401,23 +2401,23 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] ; X86-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1] -; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X86-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X86-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] +; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] ; X86-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02] -; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] -; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] -; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] -; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05] +; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6] +; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] +; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05] +; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06] ; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] -; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] ; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] +; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] +; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0] ; X86-NEXT: popl %esi # encoding: [0x5e] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -2426,23 +2426,23 @@ ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1] -; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] -; X64-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] +; X64-NEXT: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01] +; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] +; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1] ; X64-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02] -; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] -; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] -; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X64-NEXT: addl %edx, %eax 
# encoding: [0x01,0xd0] -; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05] +; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04] ; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0] ; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2] +; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca] +; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05] +; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8] ; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06] ; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0] -; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] -; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] ; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8] +; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8] +; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask) diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -53,8 +53,8 @@ ; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: vfmadd213ph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm0 ; CHECK-NEXT: vmulph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to32}, %zmm2, %zmm2 +; CHECK-NEXT: vmulph %zmm2, %zmm1, %zmm1 ; CHECK-NEXT: vmulph %zmm0, %zmm1, %zmm0 -; CHECK-NEXT: vmulph %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %1 = call fast <32 x half> @llvm.sqrt.v32f16(<32 x half> %a0) %2 = fdiv fast <32 x half> %a1, %1 @@ -697,9 +697,9 @@ ; CHECK-NEXT: kmovd %k0, %esi ; CHECK-NEXT: vcmpnltsh {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax +; CHECK-NEXT: andb %cl, %dl ; CHECK-NEXT: andb %sil, %al ; CHECK-NEXT: andb %dl, %al -; CHECK-NEXT: andb %cl, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 2, i8 -1, i32 4) diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -2094,10 +2094,10 @@ define <16 x i32> @pr52561(<16 x i32> %a, <16 x i32> %b) "min-legal-vector-width"="256" "prefer-vector-width"="256" nounwind { ; X64-LABEL: pr52561: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} ymm4 = [112,112,112,112,112,112,112,112] -; X64-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; X64-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; X64-NEXT: vpaddd %ymm4, %ymm3, %ymm2 ; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -2110,11 +2110,11 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-32, %esp ; X86-NEXT: subl $32, %esp +; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; X86-NEXT: vpaddd 8(%ebp), %ymm1, %ymm1 -; X86-NEXT: vpbroadcastd {{.*#+}} ymm3 = [112,112,112,112,112,112,112,112] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; X86-NEXT: vpbroadcastd {{.*#+}} ymm2 = [112,112,112,112,112,112,112,112] ; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; X86-NEXT: vpaddd 
%ymm3, %ymm1, %ymm1 +; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 ; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; X86-NEXT: vmovsh %xmm0, %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll @@ -7,10 +7,10 @@ ; CHECK-NEXT: vpbroadcastq %rdi, %zmm3 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; CHECK-NEXT: vpmovsxdq %ymm2, %zmm2 -; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm4 -; CHECK-NEXT: vpaddq %zmm2, %zmm4, %zmm2 +; CHECK-NEXT: vpaddq %zmm2, %zmm2, %zmm2 +; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm2 ; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 -; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm3 +; CHECK-NEXT: vpaddq %zmm0, %zmm0, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -1665,9 +1665,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc2] ; X86-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qb_128: @@ -1675,9 +1675,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc2] ; X64-NEXT: vpmovqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) @@ -1719,9 +1719,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc2] ; X86-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovsqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qb_128: @@ -1729,9 +1729,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc2] ; X64-NEXT: vpmovsqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovsqb %xmm0, 
%xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) @@ -1773,9 +1773,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc2] ; X86-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovusqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qb_128: @@ -1783,9 +1783,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusqb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc2] ; X64-NEXT: vpmovusqb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovusqb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.128(<2 x i64> %x0, <16 x i8> %x1, i8 %x2) @@ -1827,9 +1827,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc2] ; X86-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1838,9 +1838,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc2] ; X64-NEXT: vpmovqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -1885,9 +1885,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc2] ; X86-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: 
[0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovsqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1896,9 +1896,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc2] ; X64-NEXT: vpmovsqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovsqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -1943,9 +1943,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc2] ; X86-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovusqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -1954,9 +1954,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusqb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc2] ; X64-NEXT: vpmovusqb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovusqb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.256(<4 x i64> %x0, <16 x i8> %x1, i8 -1) @@ -2001,9 +2001,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc2] ; X86-NEXT: vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qw_128: @@ -2011,9 +2011,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc2] ; X64-NEXT: vpmovqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, 
%xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) @@ -2055,9 +2055,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc2] ; X86-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovsqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qw_128: @@ -2065,9 +2065,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc2] ; X64-NEXT: vpmovsqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovsqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) @@ -2109,9 +2109,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc2] ; X86-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovusqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qw_128: @@ -2119,9 +2119,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusqw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc2] ; X64-NEXT: vpmovusqw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovusqw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.128(<2 x i64> %x0, <8 x i16> %x1, i8 %x2) @@ -2163,9 +2163,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc2] ; X86-NEXT: vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovqw %ymm0, %xmm0 
{%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2174,9 +2174,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc2] ; X64-NEXT: vpmovqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -2221,9 +2221,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc2] ; X86-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovsqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2232,9 +2232,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc2] ; X64-NEXT: vpmovsqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovsqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -2279,9 +2279,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc2] ; X86-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovusqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2290,9 +2290,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusqw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc2] ; X64-NEXT: vpmovusqw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovusqw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: 
vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.256(<4 x i64> %x0, <8 x i16> %x1, i8 -1) @@ -2337,9 +2337,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc2] ; X86-NEXT: vpmovqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1] +; X86-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] ; X86-NEXT: vpmovqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc0] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_128: @@ -2347,9 +2347,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc2] ; X64-NEXT: vpmovqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1] +; X64-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] ; X64-NEXT: vpmovqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc0] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) @@ -2391,9 +2391,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc2] ; X86-NEXT: vpmovsqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1] +; X86-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] ; X86-NEXT: vpmovsqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc0] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_128: @@ -2401,9 +2401,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc2] ; X64-NEXT: vpmovsqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1] +; X64-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] ; X64-NEXT: vpmovsqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc0] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) @@ -2445,9 +2445,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc2] ; X86-NEXT: 
vpmovusqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1] +; X86-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] ; X86-NEXT: vpmovusqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc0] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_128: @@ -2455,9 +2455,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusqd %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc2] ; X64-NEXT: vpmovusqd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1] +; X64-NEXT: vpaddd %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] ; X64-NEXT: vpmovusqd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc0] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 -1) %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.128(<2 x i64> %x0, <4 x i32> %x1, i8 %x2) @@ -2734,9 +2734,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc2] ; X86-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_db_128: @@ -2744,9 +2744,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc2] ; X64-NEXT: vpmovdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) @@ -2788,9 +2788,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc2] ; X86-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovsdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_db_128: @@ -2798,9 +2798,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; 
X64-NEXT: vpmovsdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc2] ; X64-NEXT: vpmovsdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovsdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) @@ -2842,9 +2842,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc2] ; X86-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovusdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_db_128: @@ -2852,9 +2852,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusdb %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc2] ; X64-NEXT: vpmovusdb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovusdb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 -1) %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.128(<4 x i32> %x0, <16 x i8> %x1, i8 %x2) @@ -2896,9 +2896,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc2] ; X86-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2907,9 +2907,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc2] ; X64-NEXT: vpmovdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) @@ -2954,9 +2954,9 @@ ; X86-NEXT: kmovw %eax, 
%k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc2] ; X86-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovsdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2965,9 +2965,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc2] ; X64-NEXT: vpmovsdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovsdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) @@ -3012,9 +3012,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc2] ; X86-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1] +; X86-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X86-NEXT: vpmovusdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc0] ; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3023,9 +3023,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusdb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc2] ; X64-NEXT: vpmovusdb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1] +; X64-NEXT: vpaddb %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfc,0xc9] ; X64-NEXT: vpmovusdb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc0] ; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.256(<8 x i32> %x0, <16 x i8> %x1, i8 -1) @@ -3070,9 +3070,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc2] ; X86-NEXT: vpmovdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_dw_128: @@ -3080,9 +3080,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc2] ; X64-NEXT: vpmovdw 
%xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) @@ -3124,9 +3124,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc2] ; X86-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovsdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_dw_128: @@ -3134,9 +3134,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc2] ; X64-NEXT: vpmovsdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovsdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) @@ -3178,9 +3178,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc2] ; X86-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovusdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_dw_128: @@ -3188,9 +3188,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusdw %xmm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc2] ; X64-NEXT: vpmovusdw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovusdw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 -1) %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.128(<4 x i32> %x0, <8 x i16> %x1, i8 %x2) @@ -3232,9 +3232,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: 
vpmovdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc2] ; X86-NEXT: vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3243,9 +3243,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc2] ; X64-NEXT: vpmovdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) @@ -3290,9 +3290,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc2] ; X86-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovsdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3301,9 +3301,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc2] ; X64-NEXT: vpmovsdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovsdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) @@ -3348,9 +3348,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc2] ; X86-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1] +; X86-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X86-NEXT: vpmovusdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc0] ; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3359,9 +3359,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovusdw %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc2] ; X64-NEXT: vpmovusdw %ymm0, %xmm1 {%k1} # encoding: 
[0x62,0xf2,0x7e,0x29,0x13,0xc1] +; X64-NEXT: vpaddw %xmm1, %xmm2, %xmm1 # encoding: [0xc5,0xe9,0xfd,0xc9] ; X64-NEXT: vpmovusdw %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc0] ; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %x0, <8 x i16> %x1, i8 -1) @@ -4357,9 +4357,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02] ; X86-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a] +; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3] ; X86-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2] +; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_vcvtps2ph_128: @@ -4367,9 +4367,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2ph $2, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x1d,0xc2,0x02] ; X64-NEXT: vcvtps2ph $10, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x1d,0xc3,0x0a] +; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3] ; X64-NEXT: vcvtps2ph $11, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x1d,0xc1,0x0b] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2] +; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) %res2 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 10, <8 x i16> zeroinitializer, i8 %mask) @@ -4388,9 +4388,9 @@ ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02] ; X86-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b] +; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3] ; X86-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2] +; X86-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4399,9 +4399,9 @@ ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2ph $2, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x1d,0xc2,0x02] ; X64-NEXT: vcvtps2ph $11, %ymm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x1d,0xc3,0x0b] +; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xfd,0xd3] ; X64-NEXT: vcvtps2ph $12, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x1d,0xc1,0x0c] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc3] -; X64-NEXT: 
vpaddw %xmm2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfd,0xc2] +; X64-NEXT: vpaddw %xmm2, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0xfd,0xc2] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1) diff --git a/llvm/test/CodeGen/X86/bmi-out-of-order.ll b/llvm/test/CodeGen/X86/bmi-out-of-order.ll --- a/llvm/test/CodeGen/X86/bmi-out-of-order.ll +++ b/llvm/test/CodeGen/X86/bmi-out-of-order.ll @@ -68,10 +68,10 @@ ; X64-LABEL: blsmask_through2: ; X64: # %bb.0: # %entry ; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: xorl %edx, %edi -; X64-NEXT: xorl %esi, %edi ; X64-NEXT: leal -1(%rsi), %eax +; X64-NEXT: xorl %edx, %edi ; X64-NEXT: xorl %edi, %eax +; X64-NEXT: xorl %esi, %eax ; X64-NEXT: retq entry: %sub = add nsw i32 %b, -1 @@ -104,11 +104,11 @@ ; ; X64-LABEL: blsmask_through3: ; X64: # %bb.0: # %entry -; X64-NEXT: xorq %rdx, %rdi -; X64-NEXT: xorq %rcx, %rdi -; X64-NEXT: xorq %rsi, %rdi ; X64-NEXT: leaq -1(%rsi), %rax +; X64-NEXT: xorq %rdx, %rdi ; X64-NEXT: xorq %rdi, %rax +; X64-NEXT: xorq %rsi, %rcx +; X64-NEXT: xorq %rcx, %rax ; X64-NEXT: retq entry: %sub = add nsw i64 %b, -1 @@ -159,19 +159,19 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: addl $-1, %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: adcl $-1, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: xorl %ebx, %ebp +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: adcl $-1, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %ebp, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %edi, %eax -; X86-NEXT: xorl %ebp, %esi +; X86-NEXT: xorl %ebx, %esi ; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: imull %eax, %ebx +; X86-NEXT: imull %eax, %ebp ; X86-NEXT: mull %edi -; X86-NEXT: imull %edi, %ebp -; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: popl %esi @@ -264,10 +264,10 @@ ; X64-LABEL: blsi_through2: ; X64: # %bb.0: # %entry ; X64-NEXT: movl %esi, %eax -; X64-NEXT: andl %edx, %edi -; X64-NEXT: andl %esi, %edi ; X64-NEXT: negl %eax +; X64-NEXT: andl %edx, %edi ; X64-NEXT: andl %edi, %eax +; X64-NEXT: andl %esi, %eax ; X64-NEXT: retq entry: %sub = sub i32 0, %b @@ -299,10 +299,10 @@ ; X64-LABEL: blsi_through3: ; X64: # %bb.0: # %entry ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: andq %rdx, %rdi -; X64-NEXT: andq %rsi, %rdi ; X64-NEXT: negq %rax +; X64-NEXT: andq %rdx, %rdi ; X64-NEXT: andq %rdi, %rax +; X64-NEXT: andq %rsi, %rax ; X64-NEXT: retq entry: %sub = sub i64 0, %b @@ -362,9 +362,9 @@ ; X86-NEXT: andl %eax, %ecx ; X86-NEXT: imull %edx, %ebx ; X86-NEXT: imull %eax, %edi -; X86-NEXT: addl %ebx, %edi ; X86-NEXT: mull %edx ; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: popl %esi @@ -456,10 +456,10 @@ ; X64-LABEL: blsr_through2: ; X64: # %bb.0: # %entry ; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: andl %edx, %edi -; X64-NEXT: andl %esi, %edi ; X64-NEXT: leal -1(%rsi), %eax +; X64-NEXT: andl %edx, %edi ; X64-NEXT: andl %edi, %eax +; X64-NEXT: andl %esi, %eax ; X64-NEXT: retq entry: %sub = add nsw i32 %b, -1 @@ -492,12 +492,12 @@ ; ; X64-LABEL: blsr_through3: ; X64: # %bb.0: # %entry -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: andq %rdx, %rdi -; X64-NEXT: andq %rcx, %rdi -; X64-NEXT: andq %rsi, %rdi -; X64-NEXT: 
negq %rax -; X64-NEXT: andq %rdi, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: andq %rsi, %rcx +; X64-NEXT: negq %rsi +; X64-NEXT: andq %rdx, %rax +; X64-NEXT: andq %rsi, %rax +; X64-NEXT: andq %rcx, %rax ; X64-NEXT: retq entry: %sub = sub nsw i64 0, %b @@ -548,19 +548,19 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: addl $-1, %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: adcl $-1, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: andl %ebx, %ebp +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: adcl $-1, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: andl %ebp, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl %edi, %eax -; X86-NEXT: andl %ebp, %esi +; X86-NEXT: andl %ebx, %esi ; X86-NEXT: andl %eax, %ecx -; X86-NEXT: imull %eax, %ebx +; X86-NEXT: imull %eax, %ebp ; X86-NEXT: mull %edi -; X86-NEXT: imull %edi, %ebp -; X86-NEXT: addl %ebx, %ebp ; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %edi, %ebx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -249,9 +249,9 @@ ; AVX1-NEXT: vpsubd 16(%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpsubd (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) ; AVX1-NEXT: vmovdqu %xmm0, (%rsi) ; AVX1-NEXT: vmovdqu %xmm3, 16(%rdi) diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -124,10 +124,10 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: pushl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: calll __divdi3 @@ -136,10 +136,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %ecx, 4(%edx) ; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ebp -; X86-NEXT: imull %ebp, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: imull %eax, %ebp +; X86-NEXT: mull %ebx +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: subl %eax, %esi ; X86-NEXT: sbbl %ecx, %edi @@ -178,141 +178,146 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: subl $152, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: sarl $31, %ebp ; X86-NEXT: movl %eax, %edx -; X86-NEXT: xorl %esi, %edx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: movl %eax, %ecx +; 
X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl %eax, %esi ; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %eax, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %ebx -; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: xorl %ebp, %edi -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: xorl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: xorl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %ecx, %esi -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edi +; X86-NEXT: subl %ebp, %edi +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: xorl %eax, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: orl %ebp, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: sete %cl +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: bsrl %edi, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %eax, %edx +; X86-NEXT: sete %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: movb %al, (%esp) # 1-byte Spill +; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: addl $32, %edx -; X86-NEXT: testl %edi, %edi -; X86-NEXT: cmovnel %eax, %edx -; X86-NEXT: bsrl %ebx, %eax -; X86-NEXT: xorl $31, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: bsrl %ebp, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx +; X86-NEXT: testl %esi, %esi +; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: bsrl %edi, %edi +; X86-NEXT: xorl $31, %edi +; X86-NEXT: addl $32, %edi ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: cmovnel %eax, %ecx -; X86-NEXT: addl $64, %ecx +; X86-NEXT: cmovnel %edx, %edi +; X86-NEXT: addl $64, %edi ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, %ebp -; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ebp +; X86-NEXT: cmovnel %ecx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: bsrl %ebp, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: addl $32, %ecx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %esi ; X86-NEXT: xorl $31, %esi -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %edx +; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: testl %eax, %eax ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: bsrl %ebp, %edi -; X86-NEXT: xorl $31, %edi -; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: xorl $31, %esi -; X86-NEXT: addl $32, %esi -; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: cmovnel %edi, %esi -; X86-NEXT: addl $64, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: cmovnel %edx, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl %esi, %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: subl %edx, %edi +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: movl $0, %esi -; X86-NEXT: sbbl %esi, %esi -; X86-NEXT: movl $127, %edx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %ebp, %edx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: movl $0, %edx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: setb %dl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload -; X86-NEXT: cmovnel %edi, %ebx -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: cmovnel %edi, %esi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: movl $127, %ecx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: setb %cl +; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovnel %edi, %eax -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: jne .LBB4_8 -; X86-NEXT: # %bb.1: # %_udiv-special-cases +; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovnel %esi, %ebp +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: cmovnel %esi, %eax +; X86-NEXT: 
cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # %bb.8: # %_udiv-special-cases +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: xorl $127, %edx -; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %ecx ; X86-NEXT: orl %edx, %ecx -; X86-NEXT: je .LBB4_8 -; X86-NEXT: # %bb.2: # %udiv-bb1 +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: je .LBB4_9 +; X86-NEXT: # %bb.5: # %udiv-bb1 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -322,7 +327,6 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: xorb $127, %al ; X86-NEXT: movb %al, %ch @@ -330,43 +334,46 @@ ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %edi -; X86-NEXT: movl 144(%esp,%edi), %edx -; X86-NEXT: movl 148(%esp,%edi), %ebx +; X86-NEXT: movsbl %al, %esi +; X86-NEXT: movl 144(%esp,%esi), %edx +; X86-NEXT: movl 148(%esp,%esi), %ebx ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %ebx ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 140(%esp,%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shrl %esi -; X86-NEXT: shrl %cl, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%esp,%edi), %edx +; X86-NEXT: movl 140(%esp,%esi), %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: shrl %edi +; X86-NEXT: shrl %cl, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 136(%esp,%esi), %edx ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %eax ; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl $1, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: adcl $0, %edx -; X86-NEXT: jae .LBB4_3 +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl $0, %ebp +; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.6: ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: jmp .LBB4_7 -; X86-NEXT: .LBB4_3: # %udiv-preheader +; X86-NEXT: .LBB4_1: +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: jmp .LBB4_9 +; X86-NEXT: .LBB4_2: # %udiv-preheader ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 
4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -374,10 +381,10 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movb %dl, %ch ; X86-NEXT: andb $7, %ch ; X86-NEXT: movb %dl, %cl @@ -385,28 +392,26 @@ ; X86-NEXT: andb $15, %cl ; X86-NEXT: movzbl %cl, %edx ; X86-NEXT: movl 100(%esp,%edx), %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esp,%edx), %ebp -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl 96(%esp,%edx), %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edi, %edx ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shrdl %cl, %esi, %edx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl 88(%esp,%eax), %ebx -; X86-NEXT: movl 92(%esp,%eax), %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 88(%esp,%ebx), %ebp +; X86-NEXT: movl 92(%esp,%ebx), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: notb %cl -; X86-NEXT: addl %ebp, %ebp -; X86-NEXT: shll %cl, %ebp -; X86-NEXT: orl %eax, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %edi +; X86-NEXT: shll %cl, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shrl %cl, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: shrdl %cl, %eax, %ebx +; X86-NEXT: shrdl %cl, %ebx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -416,32 +421,31 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: .p2align 4, 0x90 -; X86-NEXT: .LBB4_4: # %udiv-do-while +; X86-NEXT: .LBB4_3: # %udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: shldl $1, %edx, %ebp +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Reload -; X86-NEXT: shldl $1, %edx, (%esp) # 4-byte Folded Spill -; X86-NEXT: shldl $1, %ebx, %edx -; X86-NEXT: shldl $1, %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %edi +; X86-NEXT: shldl $1, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl $1, %ebp, %edx +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %edi ; X86-NEXT: orl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: shldl $1, %esi, %ecx ; X86-NEXT: orl %eax, %ecx @@ -449,146 +453,141 @@ ; X86-NEXT: addl %esi, %esi ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: cmpl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl %edi, %esi +; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: subl %ecx, %ebp ; X86-NEXT: sbbl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: adcl $-1, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $-1, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %esi, %ecx +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: jne .LBB4_4 -; X86-NEXT: # %bb.5: -; X86-NEXT: movl %edi, %ebx +; X86-NEXT: jne .LBB4_3 +; X86-NEXT: # %bb.4: +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $1, %esi, %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: shldl $1, %eax, %esi -; X86-NEXT: orl %ecx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl $1, %edi, %eax +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: shldl $1, %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %eax ; X86-NEXT: orl %ecx, %eax -; X86-NEXT: addl %edi, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: .LBB4_8: # %udiv-end +; X86-NEXT: addl %esi, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: .LBB4_9: # %udiv-end ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: xorl %ecx, %edi ; X86-NEXT: xorl %ecx, %eax -; X86-NEXT: movl %edi, %edx -; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: subl %ecx, %edx +; X86-NEXT: xorl %ecx, %esi +; X86-NEXT: subl %ecx, %esi ; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %ecx, %esi +; X86-NEXT: sbbl %ecx, %edi ; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edx, (%ecx) +; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) +; X86-NEXT: movl %edi, 8(%ecx) ; X86-NEXT: movl %ebx, 12(%ecx) ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %esi, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl %edx, 
%ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull %eax, %ecx ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: imull {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %ecx, %edi ; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: imull %edx, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %ebp +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: imull %ebp, %edi +; X86-NEXT: mull %ebp +; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl %edi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, (%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) ; X86-NEXT: addl $152, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -614,8 +613,8 @@ ; X64-NEXT: movq %rax, (%rbx) ; X64-NEXT: imulq %rax, %r14 ; X64-NEXT: mulq %r15 +; X64-NEXT: addq %r14, %rdx ; X64-NEXT: imulq %r15, %rcx -; X64-NEXT: addq %r14, %rcx ; X64-NEXT: addq %rdx, %rcx ; X64-NEXT: subq %rax, %r13 ; X64-NEXT: sbbq %rcx, %r12 diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -124,10 +124,10 @@ ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: pushl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: 
pushl %ebp +; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: calll __udivdi3 @@ -136,10 +136,10 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %ecx, 4(%edx) ; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: imull %eax, %ebx -; X86-NEXT: mull %ebp -; X86-NEXT: imull %ebp, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: imull %eax, %ebp +; X86-NEXT: mull %ebx +; X86-NEXT: addl %ebp, %edx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: addl %edx, %ecx ; X86-NEXT: subl %eax, %esi ; X86-NEXT: sbbl %ecx, %edi @@ -177,109 +177,106 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $132, %esp +; X86-NEXT: subl $136, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: orl %ebp, %eax -; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sete (%esp) # 1-byte Folded Spill -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sete %cl +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: bsrl %ebp, %esi -; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: sete %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: movb %al, (%esp) # 1-byte Spill +; X86-NEXT: bsrl %ebp, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: addl $32, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: bsrl %edi, %esi -; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: bsrl %edi, %edi +; X86-NEXT: xorl $31, %edi +; X86-NEXT: addl $32, %edi +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: cmovnel %edx, %edi +; X86-NEXT: addl $64, %edi +; X86-NEXT: movl %eax, %edx +; X86-NEXT: orl %ebp, %edx +; X86-NEXT: cmovnel %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx -; X86-NEXT: testl %edi, %edi -; X86-NEXT: cmovnel %esi, %ecx -; X86-NEXT: addl $64, %ecx -; X86-NEXT: orl %ebp, %ebx +; X86-NEXT: testl %ebp, %ebp ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bsrl %eax, %esi ; X86-NEXT: xorl $31, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: bsrl %edx, %edx +; X86-NEXT: bsrl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %eax, %eax ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: bsrl %ebp, %edi -; X86-NEXT: xorl $31, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: bsrl %eax, %esi -; X86-NEXT: xorl $31, %esi -; X86-NEXT: addl $32, %esi -; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: cmovnel %edi, %esi -; 
X86-NEXT: addl $64, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: cmovnel %edx, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl %esi, %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: orl %ebp, %esi +; X86-NEXT: cmovnel %ecx, %edx +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: subl %edx, %edi +; X86-NEXT: movl %ebp, %edx ; X86-NEXT: movl $0, %ebp ; X86-NEXT: sbbl %ebp, %ebp ; X86-NEXT: movl $0, %esi ; X86-NEXT: sbbl %esi, %esi ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: movl $127, %edx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: cmpl %ecx, %edx -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %ebp, %edx -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: movl $0, %edx +; X86-NEXT: movl $127, %ecx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edi, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl $0, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl %eax, %edx -; X86-NEXT: setb %dl -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload -; X86-NEXT: orb (%esp), %dl # 1-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmovnel %edi, %eax -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: cmovnel %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovnel %edi, %ebx -; X86-NEXT: cmovel {{[0-9]+}}(%esp), %edi +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: setb %cl +; X86-NEXT: orb (%esp), %cl # 1-byte Folded Reload +; X86-NEXT: cmovnel %ebx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovnel %ebx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmovnel %ebx, %edi +; X86-NEXT: cmovel {{[0-9]+}}(%esp), %ebx ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.8: # %_udiv-special-cases -; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: xorl $127, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: je .LBB4_9 ; X86-NEXT: # %bb.5: # %udiv-bb1 +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -292,47 +289,49 @@ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: xorb $127, %al -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %al, %ch ; X86-NEXT: andb $7, %ch ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 124(%esp,%eax), %edx -; X86-NEXT: 
movl 128(%esp,%eax), %esi +; X86-NEXT: movl 128(%esp,%eax), %edx +; X86-NEXT: movl 132(%esp,%eax), %esi ; X86-NEXT: movb %ch, %cl ; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shll %cl, %edx ; X86-NEXT: notb %cl -; X86-NEXT: movl 120(%esp,%eax), %ebp -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl 124(%esp,%eax), %edi +; X86-NEXT: movl %edi, %esi ; X86-NEXT: shrl %esi ; X86-NEXT: shrl %cl, %esi ; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl 116(%esp,%eax), %edi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 120(%esp,%eax), %ebx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shldl %cl, %edi, %ebp -; X86-NEXT: shll %cl, %edi -; X86-NEXT: addl $1, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %ebx, %edi +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: addl $1, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $0, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: jae .LBB4_2 ; X86-NEXT: # %bb.6: -; X86-NEXT: xorl %edx, %edx ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_1: -; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: jmp .LBB4_9 ; X86-NEXT: .LBB4_2: # %udiv-preheader ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -356,16 +355,15 @@ ; X86-NEXT: shrb $3, %al ; X86-NEXT: andb $15, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 80(%esp,%eax), %edx -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl 84(%esp,%eax), %esi +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%esp,%eax), %edi -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %edi ; X86-NEXT: movl %edi, %ebx ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrdl %cl, %edx, %ebx -; X86-NEXT: movl 68(%esp,%eax), %ebp -; X86-NEXT: movl 72(%esp,%eax), %edx +; X86-NEXT: shrdl %cl, %esi, %ebx +; X86-NEXT: movl 72(%esp,%eax), %ebp +; X86-NEXT: movl 76(%esp,%eax), %edx ; X86-NEXT: movl %edx, %eax ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: notb %cl @@ -374,7 +372,8 @@ ; X86-NEXT: orl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movb %ch, %cl -; X86-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: shrdl %cl, %edx, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -391,30 +390,32 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB4_3: # 
%udiv-do-while ; X86-NEXT: # =>This Inner Loop Header: Depth=1 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: shldl $1, %ebx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: shldl $1, %edx, %ebx -; X86-NEXT: shldl $1, %eax, %edx -; X86-NEXT: shldl $1, %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: orl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl $1, %ecx, %esi -; X86-NEXT: orl %ebp, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shldl $1, %eax, %ecx -; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl %esi, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %eax, %eax ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -423,17 +424,17 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: sbbl (%esp), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ecx ; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: andl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: andl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx @@ -442,111 +443,113 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: sbbl %ebp, %ebx ; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: sbbl %ebp, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl $-1, %ecx ; X86-NEXT: adcl $-1, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl $-1, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: adcl $-1, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: adcl $-1, %esi 
; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %ebp, %eax +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebp, %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: jne .LBB4_3 ; X86-NEXT: # %bb.4: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: .LBB4_7: # %udiv-loop-exit -; X86-NEXT: shldl $1, %esi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: shldl $1, %ebp, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: shldl $1, %edi, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: addl %edi, %edi -; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: shldl $1, %edi, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: shldl $1, %ebx, %edi +; X86-NEXT: orl %eax, %edi +; X86-NEXT: addl %ebx, %ebx +; X86-NEXT: orl %ecx, %ebx ; X86-NEXT: .LBB4_9: # %udiv-end -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %ebp, 4(%ecx) -; X86-NEXT: movl %esi, 8(%ecx) -; X86-NEXT: movl %eax, 12(%ecx) -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ebp, %esi +; X86-NEXT: imull %eax, %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: imull %ebp, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %edi, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: mull %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %ebx, %ebp ; X86-NEXT: addl %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: imull %esi, %ecx +; X86-NEXT: addl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: imull %ebx, %edi +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; X86-NEXT: imull %ebx, %eax -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: imull %edi, %esi -; X86-NEXT: addl %eax, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebp, %esi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %ecx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull %edi +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl (%esp), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: subl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: addl $132, %esp +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: addl $136, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -571,8 +574,8 @@ ; X64-NEXT: movq %rax, (%rbx) ; X64-NEXT: imulq %rax, %r14 ; X64-NEXT: mulq %r15 +; X64-NEXT: addq %r14, %rdx ; X64-NEXT: imulq %r15, %rcx -; X64-NEXT: addq %r14, %rcx ; X64-NEXT: addq %rdx, %rcx ; X64-NEXT: subq %rax, %r13 ; X64-NEXT: sbbq %rcx, %r12 diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -785,9 +785,9 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA -; X32-NEXT: imull $-1431655765, %edi, %esi # imm = 0xAAAAAAAB -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -828,9 +828,9 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: imull $-858993460, %ecx, %ecx # imm = 0xCCCCCCCC -; X32-NEXT: imull $-858993459, 
%edi, %esi # imm = 0xCCCCCCCD -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-858993459, %edi, %ecx # imm = 0xCCCCCCCD +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -872,9 +872,9 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %edx ; X32-NEXT: imull $-286331154, %ecx, %ecx # imm = 0xEEEEEEEE -; X32-NEXT: imull $-286331153, %edi, %esi # imm = 0xEEEEEEEF -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-286331153, %edi, %ecx # imm = 0xEEEEEEEF +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl @@ -916,9 +916,9 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: imull $-252645136, %ecx, %ecx # imm = 0xF0F0F0F0 -; X32-NEXT: imull $-252645135, %edi, %esi # imm = 0xF0F0F0F1 -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-252645135, %edi, %ecx # imm = 0xF0F0F0F1 +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -961,9 +961,9 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %edx ; X32-NEXT: imull $-16843010, %ecx, %ecx # imm = 0xFEFEFEFE -; X32-NEXT: imull $-16843009, %esi, %esi # imm = 0xFEFEFEFF -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-16843009, %esi, %ecx # imm = 0xFEFEFEFF +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: popl %esi ; X32-NEXT: retl ; @@ -1004,9 +1004,9 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: imull $-16711936, %ecx, %ecx # imm = 0xFF00FF00 -; X32-NEXT: imull $-16711935, %edi, %esi # imm = 0xFF00FF01 -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-16711935, %edi, %ecx # imm = 0xFF00FF01 +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx @@ -1140,9 +1140,9 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA -; X32-NEXT: imull $-1431655765, %edi, %esi # imm = 0xAAAAAAAB -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: addl %esi, %edx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull $-1431655765, %edi, %ecx # imm = 0xAAAAAAAB +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -482,8 +482,8 @@ ; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; @@ -505,8 +505,8 @@ ; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: @@ -532,8 +532,8 @@ ; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; @@ -555,8 +555,8 @@ ; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; 
WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: @@ -584,8 +584,8 @@ ; X86-64-NEXT: movabsq $-1229782938247303441, %r8 # imm = 0xEEEEEEEEEEEEEEEF ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; @@ -609,8 +609,8 @@ ; WIN64-NEXT: movabsq $-1229782938247303441, %r10 # imm = 0xEEEEEEEEEEEEEEEF ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: @@ -638,8 +638,8 @@ ; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; @@ -663,8 +663,8 @@ ; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: @@ -694,8 +694,8 @@ ; X86-64-NEXT: movabsq $-72340172838076673, %r8 # imm = 0xFEFEFEFEFEFEFEFF ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; @@ -721,8 +721,8 @@ ; WIN64-NEXT: movabsq $-72340172838076673, %r10 # imm = 0xFEFEFEFEFEFEFEFF ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: @@ -750,8 +750,8 @@ ; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; @@ -775,8 +775,8 @@ ; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: @@ -806,8 +806,8 @@ ; X86-64-NEXT: movabsq $-281479271743489, %r8 # imm = 0xFFFEFFFEFFFEFFFF ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; @@ -833,8 +833,8 @@ ; WIN64-NEXT: movabsq $-281479271743489, %r10 # imm = 0xFFFEFFFEFFFEFFFF ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: @@ -862,8 +862,8 @@ ; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx ; X86-64-NEXT: retq ; @@ -887,8 +887,8 @@ ; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: @@ -916,8 +916,8 @@ ; X86-64-NEXT: imulq %rdi, %rcx ; X86-64-NEXT: movq %rdi, %rax ; X86-64-NEXT: mulq %r8 +; X86-64-NEXT: addq %rcx, %rdx ; X86-64-NEXT: imulq %rsi, %r8 -; X86-64-NEXT: addq %rcx, %r8 ; X86-64-NEXT: addq %r8, %rdx 
; X86-64-NEXT: retq ; @@ -941,8 +941,8 @@ ; WIN64-NEXT: imulq %rcx, %r9 ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %r10 +; WIN64-NEXT: addq %r9, %rdx ; WIN64-NEXT: imulq %r10, %r8 -; WIN64-NEXT: addq %r9, %r8 ; WIN64-NEXT: addq %r8, %rdx ; WIN64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/fold-add.ll b/llvm/test/CodeGen/X86/fold-add.ll --- a/llvm/test/CodeGen/X86/fold-add.ll +++ b/llvm/test/CodeGen/X86/fold-add.ll @@ -171,10 +171,10 @@ ; ; MPIC-LABEL: neg_0x80000001: ; MPIC: # %bb.0: # %entry -; MPIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx -; MPIC-NEXT: movabsq $foo@GOTOFF, %rdx +; MPIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax +; MPIC-NEXT: movabsq $foo@GOTOFF, %rcx +; MPIC-NEXT: addq %rax, %rcx ; MPIC-NEXT: movabsq $-2147483649, %rax # imm = 0xFFFFFFFF7FFFFFFF -; MPIC-NEXT: addq %rdx, %rax ; MPIC-NEXT: addq %rcx, %rax ; MPIC-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll --- a/llvm/test/CodeGen/X86/fold-masked-merge.ll +++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll @@ -149,17 +149,17 @@ ; NOBMI-LABEL: not_a_masked_merge2: ; NOBMI: # %bb.0: ; NOBMI-NEXT: movl %edi, %eax +; NOBMI-NEXT: orl %edi, %esi ; NOBMI-NEXT: notl %eax ; NOBMI-NEXT: andl %edx, %eax ; NOBMI-NEXT: orl %esi, %eax -; NOBMI-NEXT: orl %edi, %eax ; NOBMI-NEXT: retq ; ; BMI-LABEL: not_a_masked_merge2: ; BMI: # %bb.0: +; BMI-NEXT: orl %edi, %esi ; BMI-NEXT: andnl %edx, %edi, %eax ; BMI-NEXT: orl %esi, %eax -; BMI-NEXT: orl %edi, %eax ; BMI-NEXT: retq %not_an_and0 = or i32 %a0, %a1 %not = xor i32 %a0, -1 diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll --- a/llvm/test/CodeGen/X86/fold-tied-op.ll +++ b/llvm/test/CodeGen/X86/fold-tied-op.ll @@ -24,85 +24,87 @@ ; CHECK-NEXT: .cfi_offset %esi, -20 ; CHECK-NEXT: .cfi_offset %edi, -16 ; CHECK-NEXT: .cfi_offset %ebx, -12 -; CHECK-NEXT: movl $-1028477379, %edi # imm = 0xC2B2AE3D -; CHECK-NEXT: movl $668265295, %ebx # imm = 0x27D4EB4F -; CHECK-NEXT: movl a, %eax -; CHECK-NEXT: cmpl $0, (%eax) +; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D +; CHECK-NEXT: movl $668265295, %ecx # imm = 0x27D4EB4F +; CHECK-NEXT: movl a, %edi +; CHECK-NEXT: cmpl $0, (%edi) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl 8(%eax), %edi -; CHECK-NEXT: movl 12(%eax), %esi -; CHECK-NEXT: movl %esi, %edx -; CHECK-NEXT: shldl $1, %edi, %edx -; CHECK-NEXT: orl %esi, %edx -; CHECK-NEXT: leal (%edi,%edi), %ecx -; CHECK-NEXT: orl %edi, %ecx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 16(%eax), %ecx -; CHECK-NEXT: movl 20(%eax), %esi -; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shldl $2, %ecx, %edi -; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl %esi, %edi -; CHECK-NEXT: shldl $31, %ecx, %edi -; CHECK-NEXT: shll $2, %ecx -; CHECK-NEXT: orl %edi, %ecx +; CHECK-NEXT: movl 8(%edi), %esi +; CHECK-NEXT: movl 12(%edi), %eax +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: shldl $1, %esi, %edx +; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: leal (%esi,%esi), %eax +; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 16(%edi), %ebx +; CHECK-NEXT: movl 20(%edi), %esi +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: shldl $2, %ebx, %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %ebx, %eax +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: shldl $31, %eax, %ebx +; CHECK-NEXT: shll 
$2, %eax +; CHECK-NEXT: orl %ebx, %eax ; CHECK-NEXT: shrl %esi ; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: adcl %edx, %esi -; CHECK-NEXT: movl 28(%eax), %ecx -; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 24(%eax), %eax +; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl 24(%edi), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D -; CHECK-NEXT: imull %eax, %ecx -; CHECK-NEXT: mull %ebx -; CHECK-NEXT: movl %eax, %edi -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D ; CHECK-NEXT: imull %eax, %ebx -; CHECK-NEXT: addl %ecx, %ebx -; CHECK-NEXT: addl %edx, %ebx -; CHECK-NEXT: imull $1336530590, %eax, %ecx # imm = 0x4FA9D69E -; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; CHECK-NEXT: imull $-2056954758, %edx, %eax # imm = 0x85655C7A -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: mull %ecx +; CHECK-NEXT: movl %eax, %esi +; CHECK-NEXT: addl %ebx, %edx +; CHECK-NEXT: movl 28(%edi), %edi +; CHECK-NEXT: imull %edi, %ecx +; CHECK-NEXT: addl %edx, %ecx ; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; CHECK-NEXT: movl %ebx, %eax ; CHECK-NEXT: mull %edx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: shrdl $3, %ebx, %edi -; CHECK-NEXT: sarl $3, %ebx -; CHECK-NEXT: orl %ecx, %ebx -; CHECK-NEXT: orl %eax, %edi -; CHECK-NEXT: imull $326129324, %edi, %eax # imm = 0x137056AC -; CHECK-NEXT: imull $-66860409, %ebx, %ecx # imm = 0xFC03CA87 -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A +; CHECK-NEXT: addl %edx, %ebx +; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E +; CHECK-NEXT: addl %ebx, %edx +; CHECK-NEXT: shrdl $3, %ecx, %esi +; CHECK-NEXT: sarl $3, %ecx +; CHECK-NEXT: orl %edx, %ecx +; CHECK-NEXT: orl %eax, %esi ; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87 +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: mull %ebx +; CHECK-NEXT: movl %eax, %edi +; CHECK-NEXT: imull $326129324, %esi, %eax # imm = 0x137056AC +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 +; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; CHECK-NEXT: movl %edi, b ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: mull %ebx -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: xorl %esi, %ecx -; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; CHECK-NEXT: imull $326129324, %edi, %esi # imm = 0x137056AC +; CHECK-NEXT: addl %edx, %esi ; CHECK-NEXT: movl %ecx, b+4 -; CHECK-NEXT: imull $326129324, %eax, %edx # imm = 0x137056AC ; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87 -; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: movl %eax, b -; CHECK-NEXT: mull %ebx ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.else -; CHECK-NEXT: xorl b+4, %edi -; CHECK-NEXT: xorl b, %ebx -; CHECK-NEXT: movl $1419758215, %ecx # imm = 
0x549FCA87 -; CHECK-NEXT: movl %ebx, %eax -; CHECK-NEXT: mull %ecx -; CHECK-NEXT: imull $93298681, %ebx, %esi # imm = 0x58F9FF9 -; CHECK-NEXT: imull $1419758215, %edi, %ecx # imm = 0x549FCA87 -; CHECK-NEXT: addl %esi, %ecx +; CHECK-NEXT: xorl b+4, %ebx +; CHECK-NEXT: xorl b, %ecx +; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87 +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: mull %edx +; CHECK-NEXT: imull $93298681, %ecx, %esi # imm = 0x58F9FF9 +; CHECK-NEXT: addl %edx, %esi +; CHECK-NEXT: imull $1419758215, %ebx, %ecx # imm = 0x549FCA87 ; CHECK-NEXT: .LBB0_3: # %if.end -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: addl $-1028477341, %eax # imm = 0xC2B2AE63 ; CHECK-NEXT: adcl $-2048144777, %ecx # imm = 0x85EBCA77 ; CHECK-NEXT: movl %eax, b diff --git a/llvm/test/CodeGen/X86/h-registers-1.ll b/llvm/test/CodeGen/X86/h-registers-1.ll --- a/llvm/test/CodeGen/X86/h-registers-1.ll +++ b/llvm/test/CodeGen/X86/h-registers-1.ll @@ -30,11 +30,11 @@ ; CHECK-NEXT: movzbl {{[0-9]+}}(%rsp), %r8d ; CHECK-NEXT: addq %rdi, %rsi ; CHECK-NEXT: addq %rbp, %rdx +; CHECK-NEXT: addq %rsi, %rdx ; CHECK-NEXT: addq %rbx, %rcx ; CHECK-NEXT: addq %r8, %rax ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: addq %rdx, %rax -; CHECK-NEXT: addq %rsi, %rax ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: popq %rbp @@ -63,11 +63,11 @@ ; GNUX32-NEXT: movzbl {{[0-9]+}}(%esp), %r8d ; GNUX32-NEXT: addq %rdi, %rsi ; GNUX32-NEXT: addq %rbp, %rdx +; GNUX32-NEXT: addq %rsi, %rdx ; GNUX32-NEXT: addq %rbx, %rcx ; GNUX32-NEXT: addq %r8, %rax ; GNUX32-NEXT: addq %rcx, %rax ; GNUX32-NEXT: addq %rdx, %rax -; GNUX32-NEXT: addq %rsi, %rax ; GNUX32-NEXT: popq %rbx ; GNUX32-NEXT: .cfi_def_cfa_offset 16 ; GNUX32-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/hipe-cc.ll b/llvm/test/CodeGen/X86/hipe-cc.ll --- a/llvm/test/CodeGen/X86/hipe-cc.ll +++ b/llvm/test/CodeGen/X86/hipe-cc.ll @@ -36,7 +36,7 @@ define cc 11 {i32, i32, i32} @addfour(i32 %hp, i32 %p, i32 %x, i32 %y, i32 %z) nounwind { ; CHECK-LABEL: addfour: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/hipe-cc64.ll b/llvm/test/CodeGen/X86/hipe-cc64.ll --- a/llvm/test/CodeGen/X86/hipe-cc64.ll +++ b/llvm/test/CodeGen/X86/hipe-cc64.ll @@ -40,9 +40,9 @@ define cc 11 {i64, i64, i64} @addfour(i64 %hp, i64 %p, i64 %x, i64 %y, i64 %z, i64 %w) nounwind { ; CHECK-LABEL: addfour: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: leaq (%rdx,%rcx), %rax -; CHECK-NEXT: addq %r8, %rax -; CHECK-NEXT: addq %rsi, %rax +; CHECK-NEXT: addq %rsi, %rdx +; CHECK-NEXT: leaq (%rcx,%r8), %rax +; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq entry: %0 = add i64 %x, %y diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-add.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-add.ll @@ -320,36 +320,36 @@ ; SSE2-LABEL: PR37890_v16i32: ; SSE2: # %bb.0: ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; 
SSE2-NEXT: retq ; ; SSSE3-SLOW-LABEL: PR37890_v16i32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1 -; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm0 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 -; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm1 -; SSSE3-SLOW-NEXT: movd %xmm1, %eax +; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0 +; SSSE3-SLOW-NEXT: movd %xmm0, %eax ; SSSE3-SLOW-NEXT: retq ; ; SSSE3-FAST-LABEL: PR37890_v16i32: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm1 -; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1 -; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1 -; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSSE3-FAST-NEXT: paddd %xmm2, %xmm0 ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0 -; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0 -; SSSE3-FAST-NEXT: movd %xmm0, %eax +; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSSE3-FAST-NEXT: paddd %xmm0, %xmm1 +; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1 +; SSSE3-FAST-NEXT: movd %xmm1, %eax ; SSSE3-FAST-NEXT: retq ; ; AVX1-SLOW-LABEL: PR37890_v16i32: @@ -357,8 +357,8 @@ ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-fadd.ll @@ -196,7 +196,7 @@ ; SSE2-LABEL: PR37890_v8f64: ; SSE2: # %bb.0: ; SSE2-NEXT: addpd %xmm3, %xmm1 -; SSE2-NEXT: addpd %xmm2, %xmm1 +; SSE2-NEXT: addpd %xmm2, %xmm0 ; SSE2-NEXT: addpd %xmm1, %xmm0 ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -206,7 +206,7 @@ ; SSSE3-SLOW-LABEL: PR37890_v8f64: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: addpd %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: addpd %xmm2, %xmm0 ; SSSE3-SLOW-NEXT: addpd %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: movapd %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -216,7 +216,7 @@ ; SSSE3-FAST-LABEL: PR37890_v8f64: ; SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: addpd %xmm3, %xmm1 -; SSSE3-FAST-NEXT: addpd %xmm2, %xmm1 +; SSSE3-FAST-NEXT: addpd %xmm2, %xmm0 ; SSSE3-FAST-NEXT: addpd %xmm1, %xmm0 ; SSSE3-FAST-NEXT: haddpd %xmm0, %xmm0 ; SSSE3-FAST-NEXT: retq @@ -265,7 +265,7 @@ ; SSE2-LABEL: PR37890_v16f32: ; SSE2: # %bb.0: ; SSE2-NEXT: addps %xmm3, %xmm1 -; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm0 ; SSE2-NEXT: addps %xmm1, %xmm0 ; SSE2-NEXT: movaps %xmm0, %xmm1 ; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -278,7 +278,7 @@ ; SSSE3-SLOW-LABEL: PR37890_v16f32: ; SSSE3-SLOW: # %bb.0: ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm1 -; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1 +; SSSE3-SLOW-NEXT: addps %xmm2, %xmm0 ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0 ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1 ; SSSE3-SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] @@ -290,7 +290,7 @@ ; SSSE3-FAST-LABEL: PR37890_v16f32: ; 
SSSE3-FAST: # %bb.0: ; SSSE3-FAST-NEXT: addps %xmm3, %xmm1 -; SSSE3-FAST-NEXT: addps %xmm2, %xmm1 +; SSSE3-FAST-NEXT: addps %xmm2, %xmm0 ; SSSE3-FAST-NEXT: addps %xmm1, %xmm0 ; SSSE3-FAST-NEXT: movaps %xmm0, %xmm1 ; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1] diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -1321,13 +1321,13 @@ ; X86-SSE42-LABEL: test_reduce_v16i32: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxsd %xmm3, %xmm1 -; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm1 -; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pmaxsd %xmm2, %xmm0 ; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pmaxsd %xmm0, %xmm1 -; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v16i32: @@ -1335,8 +1335,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1393,13 +1393,13 @@ ; X64-SSE42-LABEL: test_reduce_v16i32: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxsd %xmm3, %xmm1 -; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm1 -; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE42-NEXT: pmaxsd %xmm2, %xmm0 ; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pmaxsd %xmm0, %xmm1 -; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE42-NEXT: pmaxsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v16i32: @@ -1407,8 +1407,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1463,26 +1463,26 @@ ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm1 -; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pmaxsw %xmm2, %xmm0 ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; 
X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v32i16: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxsw %xmm3, %xmm1 -; X86-SSE42-NEXT: pmaxsw %xmm2, %xmm1 -; X86-SSE42-NEXT: pmaxsw %xmm0, %xmm1 -; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pmaxsw %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxsw %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 ; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax @@ -1493,8 +1493,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -1519,26 +1519,26 @@ ; X64-SSE2-LABEL: test_reduce_v32i16: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm1 -; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: pmaxsw %xmm2, %xmm0 ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: test_reduce_v32i16: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxsw %xmm3, %xmm1 -; X64-SSE42-NEXT: pmaxsw %xmm2, %xmm1 -; X64-SSE42-NEXT: pmaxsw %xmm0, %xmm1 -; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pmaxsw %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxsw %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 ; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: xorl $32767, %eax ## imm = 0x7FFF ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax @@ -1549,8 +1549,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %xmm0, %eax @@ -1655,13 +1655,13 @@ ; X86-SSE42-LABEL: test_reduce_v64i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxsb %xmm3, %xmm1 -; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm1 -; X86-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminub %xmm1, %xmm0 -; 
X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: xorb $127, %al ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax @@ -1672,8 +1672,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -1749,13 +1749,13 @@ ; X64-SSE42-LABEL: test_reduce_v64i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxsb %xmm3, %xmm1 -; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm1 -; X64-SSE42-NEXT: pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminub %xmm1, %xmm0 -; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: xorb $127, %al ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax @@ -1766,8 +1766,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -1325,13 +1325,13 @@ ; X86-SSE42-LABEL: test_reduce_v16i32: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminsd %xmm3, %xmm1 -; X86-SSE42-NEXT: pminsd %xmm2, %xmm1 -; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pminsd %xmm2, %xmm0 ; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pminsd %xmm0, %xmm1 -; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v16i32: @@ -1339,8 +1339,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1397,13 +1397,13 @@ ; X64-SSE42-LABEL: test_reduce_v16i32: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminsd %xmm3, %xmm1 -; X64-SSE42-NEXT: pminsd %xmm2, %xmm1 -; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE42-NEXT: pminsd %xmm2, %xmm0 ; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pminsd %xmm0, %xmm1 -; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE42-NEXT: pminsd %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v16i32: @@ -1411,8 +1411,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1467,26 +1467,26 @@ ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminsw %xmm3, %xmm1 -; X86-SSE2-NEXT: pminsw %xmm2, %xmm1 -; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pminsw %xmm2, %xmm0 ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X86-SSE2-NEXT: movd %xmm1, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v32i16: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminsw %xmm3, %xmm1 -; X86-SSE42-NEXT: pminsw %xmm2, %xmm1 -; X86-SSE42-NEXT: pminsw %xmm0, %xmm1 -; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pminsw %xmm2, %xmm0 +; X86-SSE42-NEXT: pminsw %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 ; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax @@ -1497,8 +1497,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -1523,26 +1523,26 @@ ; X64-SSE2-LABEL: test_reduce_v32i16: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: pminsw %xmm3, %xmm1 -; X64-SSE2-NEXT: pminsw %xmm2, %xmm1 -; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: pminsw %xmm2, %xmm0 ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} 
xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: pminsw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 +; X64-SSE2-NEXT: pminsw %xmm0, %xmm1 +; X64-SSE2-NEXT: movd %xmm1, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: test_reduce_v32i16: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminsw %xmm3, %xmm1 -; X64-SSE42-NEXT: pminsw %xmm2, %xmm1 -; X64-SSE42-NEXT: pminsw %xmm0, %xmm1 -; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pminsw %xmm2, %xmm0 +; X64-SSE42-NEXT: pminsw %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 ; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: xorl $32768, %eax ## imm = 0x8000 ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax @@ -1553,8 +1553,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %xmm0, %eax @@ -1659,13 +1659,13 @@ ; X86-SSE42-LABEL: test_reduce_v64i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminsb %xmm3, %xmm1 -; X86-SSE42-NEXT: pminsb %xmm2, %xmm1 -; X86-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 -; X86-SSE42-NEXT: pminub %xmm1, %xmm0 -; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pminsb %xmm2, %xmm0 +; X86-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: addb $-128, %al ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax @@ -1676,8 +1676,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -1753,13 +1753,13 @@ ; X64-SSE42-LABEL: test_reduce_v64i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminsb %xmm3, %xmm1 -; X64-SSE42-NEXT: pminsb %xmm2, %xmm1 -; X64-SSE42-NEXT: pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 -; X64-SSE42-NEXT: pminub %xmm1, %xmm0 -; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pminsb %xmm2, %xmm0 +; X64-SSE42-NEXT: pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X64-SSE42-NEXT: movd 
%xmm0, %eax ; X64-SSE42-NEXT: addb $-128, %al ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax @@ -1770,8 +1770,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -1486,13 +1486,13 @@ ; X86-SSE42-LABEL: test_reduce_v16i32: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxud %xmm3, %xmm1 -; X86-SSE42-NEXT: pmaxud %xmm2, %xmm1 -; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pmaxud %xmm2, %xmm0 ; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pmaxud %xmm0, %xmm1 -; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X86-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v16i32: @@ -1500,8 +1500,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1573,13 +1573,13 @@ ; X64-SSE42-LABEL: test_reduce_v16i32: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxud %xmm3, %xmm1 -; X64-SSE42-NEXT: pmaxud %xmm2, %xmm1 -; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE42-NEXT: pmaxud %xmm2, %xmm0 ; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pmaxud %xmm0, %xmm1 -; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE42-NEXT: pmaxud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v16i32: @@ -1587,8 +1587,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1665,11 +1665,11 @@ ; X86-SSE42-LABEL: test_reduce_v32i16: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxuw %xmm3, %xmm1 -; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm1 -; X86-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X86-SSE42-NEXT: pcmpeqd %xmm0, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm0 -; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: pmaxuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; 
X86-SSE42-NEXT: pxor %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: notl %eax ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax @@ -1680,8 +1680,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -1730,11 +1730,11 @@ ; X64-SSE42-LABEL: test_reduce_v32i16: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxuw %xmm3, %xmm1 -; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm1 -; X64-SSE42-NEXT: pmaxuw %xmm0, %xmm1 -; X64-SSE42-NEXT: pcmpeqd %xmm0, %xmm0 -; X64-SSE42-NEXT: pxor %xmm1, %xmm0 -; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: pmaxuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: notl %eax ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax @@ -1745,8 +1745,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -1806,33 +1806,33 @@ ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1 -; X86-SSE2-NEXT: pmaxub %xmm2, %xmm1 -; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pmaxub %xmm2, %xmm0 ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $8, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 ; X86-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v64i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pmaxub %xmm3, %xmm1 -; X86-SSE42-NEXT: pmaxub %xmm2, %xmm1 -; X86-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT: pcmpeqd %xmm0, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: psrlw $8, %xmm1 -; X86-SSE42-NEXT: pminub %xmm0, %xmm1 -; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm2, %xmm0 +; X86-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT: pxor %xmm0, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminub %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 ; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: notb %al ; X86-SSE42-NEXT: ## kill: def 
$al killed $al killed $eax @@ -1843,8 +1843,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -1875,33 +1875,33 @@ ; X64-SSE2-LABEL: test_reduce_v64i8: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: pmaxub %xmm3, %xmm1 -; X64-SSE2-NEXT: pmaxub %xmm2, %xmm1 -; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: pmaxub %xmm2, %xmm0 ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrlw $8, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 ; X64-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: test_reduce_v64i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pmaxub %xmm3, %xmm1 -; X64-SSE42-NEXT: pmaxub %xmm2, %xmm1 -; X64-SSE42-NEXT: pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT: pcmpeqd %xmm0, %xmm0 -; X64-SSE42-NEXT: pxor %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: psrlw $8, %xmm1 -; X64-SSE42-NEXT: pminub %xmm0, %xmm1 -; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm2, %xmm0 +; X64-SSE42-NEXT: pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT: pxor %xmm0, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 ; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: notb %al ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax @@ -1912,8 +1912,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -1404,13 +1404,13 @@ ; X86-SSE42-LABEL: test_reduce_v16i32: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminud %xmm3, %xmm1 -; X86-SSE42-NEXT: pminud %xmm2, %xmm1 -; X86-SSE42-NEXT: pminud %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pminud %xmm2, %xmm0 ; X86-SSE42-NEXT: pminud %xmm1, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: pminud %xmm0, %xmm1 -; X86-SSE42-NEXT: movd %xmm1, %eax +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = 
xmm1[1,1,1,1] +; X86-SSE42-NEXT: pminud %xmm1, %xmm0 +; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v16i32: @@ -1418,8 +1418,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1491,13 +1491,13 @@ ; X64-SSE42-LABEL: test_reduce_v16i32: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminud %xmm3, %xmm1 -; X64-SSE42-NEXT: pminud %xmm2, %xmm1 -; X64-SSE42-NEXT: pminud %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE42-NEXT: pminud %xmm2, %xmm0 ; X64-SSE42-NEXT: pminud %xmm1, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE42-NEXT: pminud %xmm0, %xmm1 -; X64-SSE42-NEXT: movd %xmm1, %eax +; X64-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; X64-SSE42-NEXT: pminud %xmm1, %xmm0 +; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v16i32: @@ -1505,8 +1505,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1589,9 +1589,9 @@ ; X86-SSE42-LABEL: test_reduce_v32i16: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminuw %xmm3, %xmm1 -; X86-SSE42-NEXT: pminuw %xmm2, %xmm1 -; X86-SSE42-NEXT: pminuw %xmm0, %xmm1 -; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X86-SSE42-NEXT: pminuw %xmm2, %xmm0 +; X86-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 ; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE42-NEXT: retl @@ -1601,8 +1601,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax @@ -1651,9 +1651,9 @@ ; X64-SSE42-LABEL: test_reduce_v32i16: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminuw %xmm3, %xmm1 -; X64-SSE42-NEXT: pminuw %xmm2, %xmm1 -; X64-SSE42-NEXT: pminuw %xmm0, %xmm1 -; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 +; X64-SSE42-NEXT: pminuw %xmm2, %xmm0 +; X64-SSE42-NEXT: pminuw %xmm1, %xmm0 +; X64-SSE42-NEXT: phminposuw %xmm0, %xmm0 ; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE42-NEXT: retq @@ -1663,8 +1663,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; X64-AVX1-NEXT: vmovd %xmm0, %eax ; 
X64-AVX1-NEXT: ## kill: def $ax killed $ax killed $eax @@ -1716,31 +1716,31 @@ ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminub %xmm3, %xmm1 -; X86-SSE2-NEXT: pminub %xmm2, %xmm1 -; X86-SSE2-NEXT: pminub %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pminub %xmm2, %xmm0 ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pminub %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrlw $8, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 ; X86-SSE2-NEXT: pminub %xmm0, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: psrlw $8, %xmm0 +; X86-SSE2-NEXT: pminub %xmm1, %xmm0 +; X86-SSE2-NEXT: movd %xmm0, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: test_reduce_v64i8: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: pminub %xmm3, %xmm1 -; X86-SSE42-NEXT: pminub %xmm2, %xmm1 -; X86-SSE42-NEXT: pminub %xmm0, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: psrlw $8, %xmm0 +; X86-SSE42-NEXT: pminub %xmm2, %xmm0 ; X86-SSE42-NEXT: pminub %xmm1, %xmm0 -; X86-SSE42-NEXT: phminposuw %xmm0, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE42-NEXT: psrlw $8, %xmm1 +; X86-SSE42-NEXT: pminub %xmm0, %xmm1 +; X86-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X86-SSE42-NEXT: movd %xmm0, %eax ; X86-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE42-NEXT: retl @@ -1750,8 +1750,8 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X86-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -1776,31 +1776,31 @@ ; X64-SSE2-LABEL: test_reduce_v64i8: ; X64-SSE2: ## %bb.0: ; X64-SSE2-NEXT: pminub %xmm3, %xmm1 -; X64-SSE2-NEXT: pminub %xmm2, %xmm1 -; X64-SSE2-NEXT: pminub %xmm0, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X64-SSE2-NEXT: pminub %xmm2, %xmm0 ; X64-SSE2-NEXT: pminub %xmm1, %xmm0 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-SSE2-NEXT: pminub %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 +; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: pminub %xmm1, %xmm0 ; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrlw $8, %xmm1 +; X64-SSE2-NEXT: psrld $16, %xmm1 ; X64-SSE2-NEXT: pminub %xmm0, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE2-NEXT: psrlw $8, %xmm0 +; X64-SSE2-NEXT: pminub %xmm1, %xmm0 +; X64-SSE2-NEXT: movd %xmm0, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: test_reduce_v64i8: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: pminub %xmm3, %xmm1 -; X64-SSE42-NEXT: pminub %xmm2, %xmm1 -; X64-SSE42-NEXT: pminub %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: psrlw $8, %xmm0 +; X64-SSE42-NEXT: pminub %xmm2, %xmm0 ; X64-SSE42-NEXT: pminub %xmm1, %xmm0 -; X64-SSE42-NEXT: 
phminposuw %xmm0, %xmm0 +; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE42-NEXT: psrlw $8, %xmm1 +; X64-SSE42-NEXT: pminub %xmm0, %xmm1 +; X64-SSE42-NEXT: phminposuw %xmm1, %xmm0 ; X64-SSE42-NEXT: movd %xmm0, %eax ; X64-SSE42-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE42-NEXT: retq @@ -1810,8 +1810,8 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X64-AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vphminposuw %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -656,11 +656,11 @@ ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1] -; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5 +; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] +; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm1 ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6 ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6 ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3] ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0] @@ -703,15 +703,15 @@ ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0] -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-SLOW-NEXT: retq ; @@ -727,14 +727,14 @@ ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-FAST-NEXT: retq ; @@ -758,9 +758,9 @@ ; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -781,9 +781,9 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -28,7 +28,8 @@ ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: orl %ecx, %ebx ; X86-NEXT: movl %esi, %ebp -; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: orl %edx, %ebp +; X86-NEXT: orl %ecx, %ebp ; X86-NEXT: shrdl $28, %ebx, %ebp ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit diff --git a/llvm/test/CodeGen/X86/imul.ll b/llvm/test/CodeGen/X86/imul.ll --- a/llvm/test/CodeGen/X86/imul.ll +++ b/llvm/test/CodeGen/X86/imul.ll @@ -450,18 +450,13 @@ ; ; X86-LABEL: test6: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shll $5, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $5, %ecx +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $33, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl entry: %tmp3 = mul i64 %a, 33 diff --git a/llvm/test/CodeGen/X86/lea-opt-cse4.ll b/llvm/test/CodeGen/X86/lea-opt-cse4.ll --- a/llvm/test/CodeGen/X86/lea-opt-cse4.ll +++ b/llvm/test/CodeGen/X86/lea-opt-cse4.ll @@ -73,11 +73,9 @@ ; X64-NEXT: leal 1(%rax,%rcx), %ecx ; X64-NEXT: leal (%rax,%rax), %edx ; X64-NEXT: addl %eax, %edx -; X64-NEXT: addl %eax, %edx -; X64-NEXT: addl %eax, %edx -; X64-NEXT: addl %eax, %edx -; X64-NEXT: addl %ecx, %edx -; X64-NEXT: movl %edx, 16(%rdi) +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: addl %edx, %ecx +; X64-NEXT: movl %ecx, 16(%rdi) ; X64-NEXT: retq ; ; X86-LABEL: foo_loop: @@ -104,11 +102,9 @@ ; X86-NEXT: leal 1(%ecx,%esi), %edx ; X86-NEXT: leal (%ecx,%ecx), %esi ; X86-NEXT: addl %ecx, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %esi, 16(%eax) +; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: movl %edx, 16(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/lea-opt2.ll b/llvm/test/CodeGen/X86/lea-opt2.ll --- a/llvm/test/CodeGen/X86/lea-opt2.ll +++ b/llvm/test/CodeGen/X86/lea-opt2.ll @@ -35,11 +35,13 @@ define i32 @test2(ptr %p, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: test2: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-NEXT: movl %esi, %eax +; 
CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: movl %ecx, (%rdi) ; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq entry: %0 = add i32 %a, %b @@ -53,11 +55,13 @@ define i32 @test3(ptr %p, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: test3: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: addl %eax, %ecx ; CHECK-NEXT: addl %edx, %ecx -; CHECK-NEXT: addl %esi, %ecx ; CHECK-NEXT: movl %ecx, (%rdi) ; CHECK-NEXT: subl %edx, %eax +; CHECK-NEXT: # kill: def $eax killed $eax killed $rax ; CHECK-NEXT: retq entry: %0 = add i32 %a, %b @@ -110,8 +114,8 @@ ; CHECK-LABEL: test6: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: addq %rdx, %rcx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movq %rcx, (%rdi) ; CHECK-NEXT: subq %rdx, %rax ; CHECK-NEXT: retq @@ -128,8 +132,8 @@ ; CHECK-LABEL: test7: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: addq %rdx, %rcx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movq %rcx, (%rdi) ; CHECK-NEXT: subq %rdx, %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/logic-shift.ll b/llvm/test/CodeGen/X86/logic-shift.ll --- a/llvm/test/CodeGen/X86/logic-shift.ll +++ b/llvm/test/CodeGen/X86/logic-shift.ll @@ -234,8 +234,8 @@ ; CHECK-NEXT: sarq %cl, %rdi ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shrq %cl, %rsi -; CHECK-NEXT: orq %rsi, %rax ; CHECK-NEXT: orq %rdi, %rax +; CHECK-NEXT: orq %rsi, %rax ; CHECK-NEXT: retq %sh1 = ashr i64 %x0, %y %sh2 = lshr i64 %x1, %y @@ -255,8 +255,8 @@ ; CHECK-NEXT: shrq %cl, %rdi ; CHECK-NEXT: movl %r8d, %ecx ; CHECK-NEXT: shrq %cl, %rsi -; CHECK-NEXT: orq %rsi, %rax ; CHECK-NEXT: orq %rdi, %rax +; CHECK-NEXT: orq %rsi, %rax ; CHECK-NEXT: retq %sh1 = lshr i64 %x0, %y %sh2 = lshr i64 %x1, %w @@ -518,8 +518,8 @@ ; CHECK-NEXT: sarq %cl, %rdi ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: shrq %cl, %rsi -; CHECK-NEXT: xorq %rsi, %rax ; CHECK-NEXT: xorq %rdi, %rax +; CHECK-NEXT: xorq %rsi, %rax ; CHECK-NEXT: retq %sh1 = ashr i64 %x0, %y %sh2 = lshr i64 %x1, %y @@ -539,8 +539,8 @@ ; CHECK-NEXT: shrq %cl, %rdi ; CHECK-NEXT: movl %r8d, %ecx ; CHECK-NEXT: shrq %cl, %rsi -; CHECK-NEXT: xorq %rsi, %rax ; CHECK-NEXT: xorq %rdi, %rax +; CHECK-NEXT: xorq %rsi, %rax ; CHECK-NEXT: retq %sh1 = lshr i64 %x0, %y %sh2 = lshr i64 %x1, %w @@ -802,8 +802,8 @@ ; CHECK-NEXT: shrq %cl, %rdi ; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NEXT: sarq %cl, %rsi -; CHECK-NEXT: andq %rsi, %rax ; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: andq %rsi, %rax ; CHECK-NEXT: retq %sh1 = lshr i64 %x0, %y %sh2 = ashr i64 %x1, %y @@ -823,8 +823,8 @@ ; CHECK-NEXT: shrq %cl, %rdi ; CHECK-NEXT: movl %r8d, %ecx ; CHECK-NEXT: shrq %cl, %rsi -; CHECK-NEXT: andq %rsi, %rax ; CHECK-NEXT: andq %rdi, %rax +; CHECK-NEXT: andq %rsi, %rax ; CHECK-NEXT: retq %sh1 = lshr i64 %x0, %y %sh2 = lshr i64 %x1, %w diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll --- a/llvm/test/CodeGen/X86/machine-cp.ll +++ b/llvm/test/CodeGen/X86/machine-cp.ll @@ -99,62 +99,56 @@ define <16 x float> @foo(<16 x float> %x) { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: movaps %xmm3, %xmm9 -; CHECK-NEXT: movaps %xmm2, %xmm5 +; CHECK-NEXT: xorps %xmm5, %xmm5 +; CHECK-NEXT: cvttps2dq %xmm3, %xmm7 +; CHECK-NEXT: movaps %xmm3, 
%xmm4 +; CHECK-NEXT: cmpltps %xmm5, %xmm4 +; CHECK-NEXT: movaps {{.*#+}} xmm8 = [13,14,15,16] +; CHECK-NEXT: movaps %xmm4, %xmm6 +; CHECK-NEXT: orps %xmm8, %xmm6 +; CHECK-NEXT: cvtdq2ps %xmm7, %xmm3 +; CHECK-NEXT: andps %xmm8, %xmm3 +; CHECK-NEXT: andps %xmm6, %xmm3 +; CHECK-NEXT: andnps %xmm4, %xmm6 +; CHECK-NEXT: cvttps2dq %xmm2, %xmm4 +; CHECK-NEXT: movaps %xmm2, %xmm7 +; CHECK-NEXT: cmpltps %xmm5, %xmm7 +; CHECK-NEXT: movaps {{.*#+}} xmm8 = [9,10,11,12] +; CHECK-NEXT: movaps %xmm7, %xmm9 +; CHECK-NEXT: orps %xmm8, %xmm9 +; CHECK-NEXT: cvtdq2ps %xmm4, %xmm2 +; CHECK-NEXT: andps %xmm8, %xmm2 +; CHECK-NEXT: andps %xmm9, %xmm2 +; CHECK-NEXT: andnps %xmm7, %xmm9 +; CHECK-NEXT: cvttps2dq %xmm1, %xmm4 +; CHECK-NEXT: cmpltps %xmm5, %xmm1 +; CHECK-NEXT: movaps {{.*#+}} xmm7 = [5,6,7,8] +; CHECK-NEXT: movaps %xmm1, %xmm8 +; CHECK-NEXT: orps %xmm7, %xmm8 +; CHECK-NEXT: cvtdq2ps %xmm4, %xmm4 +; CHECK-NEXT: andps %xmm7, %xmm4 +; CHECK-NEXT: andps %xmm8, %xmm4 +; CHECK-NEXT: andnps %xmm1, %xmm8 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: cmpltps %xmm5, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm5 = [1,2,3,4] ; CHECK-NEXT: movaps %xmm0, %xmm7 -; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movaps %xmm3, %xmm2 -; CHECK-NEXT: cmpltps %xmm0, %xmm2 -; CHECK-NEXT: movaps %xmm2, %xmm4 -; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-NEXT: movaps %xmm4, %xmm8 -; CHECK-NEXT: andnps %xmm2, %xmm8 -; CHECK-NEXT: movaps %xmm5, %xmm6 -; CHECK-NEXT: cmpltps %xmm0, %xmm6 -; CHECK-NEXT: movaps {{.*#+}} xmm11 = [9,10,11,12] -; CHECK-NEXT: movaps %xmm6, %xmm2 -; CHECK-NEXT: orps %xmm11, %xmm2 -; CHECK-NEXT: movaps %xmm2, %xmm10 -; CHECK-NEXT: andnps %xmm6, %xmm10 -; CHECK-NEXT: cvttps2dq %xmm1, %xmm12 -; CHECK-NEXT: cmpltps %xmm0, %xmm1 -; CHECK-NEXT: movaps {{.*#+}} xmm13 = [5,6,7,8] -; CHECK-NEXT: movaps %xmm1, %xmm6 -; CHECK-NEXT: orps %xmm13, %xmm6 -; CHECK-NEXT: movaps %xmm6, %xmm14 -; CHECK-NEXT: andnps %xmm1, %xmm14 -; CHECK-NEXT: cvttps2dq %xmm7, %xmm3 -; CHECK-NEXT: cmpltps %xmm0, %xmm7 -; CHECK-NEXT: movaps {{.*#+}} xmm15 = [1,2,3,4] -; CHECK-NEXT: movaps %xmm7, %xmm0 -; CHECK-NEXT: orps %xmm15, %xmm0 -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: andnps %xmm7, %xmm1 -; CHECK-NEXT: andps %xmm15, %xmm0 -; CHECK-NEXT: cvtdq2ps %xmm3, %xmm3 -; CHECK-NEXT: andps %xmm3, %xmm0 -; CHECK-NEXT: movaps {{.*#+}} xmm3 = [1,1,1,1] -; CHECK-NEXT: andps %xmm3, %xmm1 -; CHECK-NEXT: orps %xmm1, %xmm0 -; CHECK-NEXT: andps %xmm13, %xmm6 -; CHECK-NEXT: cvtdq2ps %xmm12, %xmm1 -; CHECK-NEXT: andps %xmm1, %xmm6 -; CHECK-NEXT: andps %xmm3, %xmm14 -; CHECK-NEXT: orps %xmm14, %xmm6 -; CHECK-NEXT: andps %xmm11, %xmm2 -; CHECK-NEXT: cvttps2dq %xmm5, %xmm1 +; CHECK-NEXT: orps %xmm5, %xmm7 ; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1 -; CHECK-NEXT: andps %xmm1, %xmm2 -; CHECK-NEXT: andps %xmm3, %xmm10 -; CHECK-NEXT: orps %xmm10, %xmm2 -; CHECK-NEXT: andps %xmm3, %xmm8 -; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-NEXT: cvttps2dq %xmm9, %xmm1 -; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1 -; CHECK-NEXT: andps %xmm1, %xmm4 +; CHECK-NEXT: andps %xmm5, %xmm1 +; CHECK-NEXT: andps %xmm7, %xmm1 +; CHECK-NEXT: andnps %xmm0, %xmm7 +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: andps %xmm0, %xmm7 +; CHECK-NEXT: orps %xmm7, %xmm1 +; CHECK-NEXT: andps %xmm0, %xmm8 ; CHECK-NEXT: orps %xmm8, %xmm4 -; CHECK-NEXT: movaps %xmm6, %xmm1 -; CHECK-NEXT: movaps %xmm4, %xmm3 +; CHECK-NEXT: andps %xmm0, %xmm9 +; CHECK-NEXT: orps %xmm9, %xmm2 +; CHECK-NEXT: andps %xmm0, %xmm6 +; CHECK-NEXT: orps %xmm6, %xmm3 +; 
CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm4, %xmm1 ; CHECK-NEXT: retq bb: %v3 = icmp slt <16 x i32> , zeroinitializer diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -249,8 +249,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -384,8 +384,8 @@ ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -398,8 +398,8 @@ ; AVX1-NEXT: movl %edx, %eax ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB3_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 @@ -411,28 +411,28 @@ ; AVX1-NEXT: vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4 ; AVX1-NEXT: vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5 ; AVX1-NEXT: vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB3_1 ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -831,8 +831,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[1,1,1,1] @@ -980,8 +980,8 @@ ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -994,8 +994,8 @@ ; AVX1-NEXT: movl %edx, %eax ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB7_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1011,28 +1011,28 @@ ; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7 ; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: addq $32, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB7_1 ; AVX1-NEXT: # %bb.2: # %middle.block ; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1457,8 +1457,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1625,8 +1625,8 @@ ; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1639,9 +1639,9 @@ ; AVX1-NEXT: movl %edx, %eax ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: xorl %ecx, %ecx -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB11_1: # %vector.body ; 
AVX1-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1669,37 +1669,37 @@ ; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm10 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX1-NEXT: vpmulld %xmm11, %xmm12, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm12 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 ; AVX1-NEXT: vpaddd %xmm4, %xmm12, %xmm4 -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm4 ; AVX1-NEXT: vpaddd %xmm0, %xmm7, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4 -; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4 -; AVX1-NEXT: vpaddd %xmm3, %xmm11, %xmm3 +; AVX1-NEXT: vpaddd %xmm4, %xmm8, %xmm4 +; AVX1-NEXT: vpaddd %xmm3, %xmm9, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm10, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm11, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: addq $16, %rcx ; AVX1-NEXT: cmpq %rcx, %rax ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %middle.block -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm5 +; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -2661,17 +2661,17 @@ ; SSE2-NEXT: movdqu (%rdx), %xmm0 ; SSE2-NEXT: movdqu (%rcx), %xmm2 ; SSE2-NEXT: pmaddwd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 ; SSE2-NEXT: movdqu (%r8), %xmm0 -; SSE2-NEXT: movdqu (%r9), %xmm3 -; SSE2-NEXT: pmaddwd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqu (%r10), %xmm0 -; SSE2-NEXT: movdqu (%rax), %xmm1 +; SSE2-NEXT: movdqu (%r9), %xmm1 ; SSE2-NEXT: pmaddwd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movdqu (%r10), %xmm0 +; SSE2-NEXT: movdqu (%rax), %xmm2 +; SSE2-NEXT: pmaddwd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -2685,13 +2685,13 @@ ; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqu (%rdx), %xmm1 ; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vmovdqu (%r8), %xmm2 -; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2 -; AVX-NEXT: 
vpaddd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%r10), %xmm2 -; AVX-NEXT: vpmaddwd (%rax), %xmm2, %xmm2 -; AVX-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovdqu (%r8), %xmm1 +; AVX-NEXT: vpmaddwd (%r9), %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqu (%r10), %xmm1 +; AVX-NEXT: vpmaddwd (%rax), %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -3163,8 +3163,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll --- a/llvm/test/CodeGen/X86/masked_gather.ll +++ b/llvm/test/CodeGen/X86/masked_gather.ll @@ -1754,8 +1754,8 @@ ; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm2 {%k2} ; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28] ; AVX512F-NEXT: vpgatherdd c(,%zmm0), %zmm1 {%k1} -; AVX512F-NEXT: vpaddd %ymm1, %ymm1, %ymm0 -; AVX512F-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpaddd %ymm1, %ymm2, %ymm0 +; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: gather_v8i32_v8i32: @@ -1768,8 +1768,8 @@ ; AVX512VL-NEXT: vpgatherdd c(,%ymm1), %ymm2 {%k2} ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [28,28,28,28,28,28,28,28] ; AVX512VL-NEXT: vpgatherdd c(,%ymm1), %ymm0 {%k1} -; AVX512VL-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq %1 = icmp eq <8 x i32> %trigger, zeroinitializer %2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> getelementptr (%struct.a, <8 x ptr> , <8 x i64> zeroinitializer, i32 0, <8 x i64> ), i32 4, <8 x i1> %1, <8 x i32> undef) diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -499,13 +499,13 @@ ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824] ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 +; KNL_64-NEXT: vpaddq %zmm4, %zmm2, %zmm2 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 -; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -520,8 +520,8 @@ ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820] ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 -; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1 +; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; KNL_32-NEXT: vpaddd 
%ymm1, %ymm0, %ymm1 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; KNL_32-NEXT: movw $255, %ax ; KNL_32-NEXT: kmovw %eax, %k1 @@ -533,10 +533,10 @@ ; SKX_SMALL: # %bb.0: # %entry ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2 ; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 -; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm1 +; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} @@ -550,8 +550,8 @@ ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 -; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm1 +; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} @@ -583,13 +583,13 @@ ; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 ; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824] ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 +; KNL_64-NEXT: vpaddq %zmm4, %zmm2, %zmm2 ; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 ; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 ; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 ; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 -; KNL_64-NEXT: vpaddq %zmm1, %zmm4, %zmm1 ; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 ; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -604,8 +604,8 @@ ; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 ; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820] ; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 -; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm1 +; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; KNL_32-NEXT: movw $255, %ax ; KNL_32-NEXT: kmovw %eax, %k1 @@ -617,10 +617,10 @@ ; SKX_SMALL: # %bb.0: # %entry ; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2 ; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero ; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 -; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm1 +; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 ; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} @@ -634,8 +634,8 @@ ; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1 ; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 -; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; SKX_LARGE-NEXT: vpaddq %zmm0, 
%zmm2, %zmm1 +; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 ; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 ; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} @@ -3496,7 +3496,7 @@ ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1} ; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0 -; SKX-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_pr28312: @@ -3513,7 +3513,7 @@ ; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1} ; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0 -; SKX_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; SKX_32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp ; SKX_32-NEXT: .cfi_def_cfa %esp, 4 diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll @@ -1615,12 +1615,12 @@ ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 -; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: pmovmskb %xmm3, %eax +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pmovmskb %xmm2, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: retl @@ -1635,12 +1635,12 @@ ; X86-SSE41-NEXT: pxor %xmm0, %xmm2 ; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm1, %xmm3 -; X86-SSE41-NEXT: por %xmm0, %xmm3 -; X86-SSE41-NEXT: por %xmm2, %xmm3 -; X86-SSE41-NEXT: ptest %xmm3, %xmm3 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm1, %xmm2 +; X86-SSE41-NEXT: por %xmm0, %xmm2 +; X86-SSE41-NEXT: ptest %xmm2, %xmm2 ; X86-SSE41-NEXT: sete %al ; X86-SSE41-NEXT: retl %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind @@ -1713,12 +1713,12 @@ ; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 -; X86-SSE2-NEXT: pand %xmm0, %xmm3 -; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: pmovmskb %xmm3, %eax +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm0, %xmm2 +; X86-SSE2-NEXT: pmovmskb %xmm2, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al ; X86-SSE2-NEXT: retl @@ -1733,12 +1733,12 @@ ; X86-SSE41-NEXT: pxor %xmm0, %xmm2 ; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE41-NEXT: pxor %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm1, %xmm3 -; X86-SSE41-NEXT: por %xmm0, %xmm3 -; X86-SSE41-NEXT: por %xmm2, %xmm3 -; X86-SSE41-NEXT: ptest %xmm3, %xmm3 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm1, %xmm2 +; 
X86-SSE41-NEXT: por %xmm0, %xmm2 +; X86-SSE41-NEXT: ptest %xmm2, %xmm2 ; X86-SSE41-NEXT: sete %al ; X86-SSE41-NEXT: retl %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 48) nounwind @@ -1777,8 +1777,8 @@ ; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: pand %xmm1, %xmm2 ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pmovmskb %xmm2, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF @@ -1793,8 +1793,8 @@ ; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE41-NEXT: por %xmm1, %xmm0 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE41-NEXT: por %xmm1, %xmm2 ; X86-SSE41-NEXT: por %xmm0, %xmm2 ; X86-SSE41-NEXT: ptest %xmm2, %xmm2 ; X86-SSE41-NEXT: setne %al @@ -1844,22 +1844,22 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm2 -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm2 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqu 47(%ecx), %xmm2 -; X86-SSE2-NEXT: movdqu 47(%eax), %xmm4 -; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm4 -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pmovmskb %xmm4, %eax +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqu 47(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 47(%eax), %xmm3 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pmovmskb %xmm3, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -1868,22 +1868,22 @@ ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm2 -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: pxor %xmm2, %xmm1 -; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm2 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm2, %xmm3 -; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm2 -; X86-SSE41-NEXT: movdqu 47(%eax), %xmm4 -; X86-SSE41-NEXT: pxor %xmm2, %xmm4 -; X86-SSE41-NEXT: por %xmm3, %xmm4 -; X86-SSE41-NEXT: por %xmm1, %xmm4 -; X86-SSE41-NEXT: por %xmm0, %xmm4 -; X86-SSE41-NEXT: ptest %xmm4, %xmm4 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 +; 
X86-SSE41-NEXT: pxor %xmm1, %xmm2 +; X86-SSE41-NEXT: movdqu 47(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 47(%eax), %xmm3 +; X86-SSE41-NEXT: pxor %xmm1, %xmm3 +; X86-SSE41-NEXT: por %xmm2, %xmm3 +; X86-SSE41-NEXT: por %xmm0, %xmm3 +; X86-SSE41-NEXT: ptest %xmm3, %xmm3 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 63) nounwind @@ -1957,9 +1957,9 @@ ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al @@ -1976,9 +1976,9 @@ ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE41-NEXT: por %xmm3, %xmm2 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: por %xmm2, %xmm1 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 ; X86-SSE41-NEXT: ptest %xmm0, %xmm0 ; X86-SSE41-NEXT: sete %al ; X86-SSE41-NEXT: retl @@ -2027,22 +2027,22 @@ ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movdqu (%ecx), %xmm1 -; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm2 -; X86-SSE2-NEXT: movdqu (%eax), %xmm0 +; X86-SSE2-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu (%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm1 -; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm2 -; X86-SSE2-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm2 -; X86-SSE2-NEXT: movdqu 48(%eax), %xmm4 -; X86-SSE2-NEXT: pcmpeqb %xmm2, %xmm4 -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pmovmskb %xmm4, %eax +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqu 48(%ecx), %xmm1 +; X86-SSE2-NEXT: movdqu 48(%eax), %xmm3 +; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm3 +; X86-SSE2-NEXT: pand %xmm2, %xmm3 +; X86-SSE2-NEXT: pand %xmm0, %xmm3 +; X86-SSE2-NEXT: pmovmskb %xmm3, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: retl @@ -2051,22 +2051,22 @@ ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE41-NEXT: movdqu (%ecx), %xmm1 -; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm2 -; X86-SSE41-NEXT: movdqu (%eax), %xmm0 +; X86-SSE41-NEXT: movdqu (%ecx), %xmm0 +; X86-SSE41-NEXT: movdqu 16(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu (%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0 ; X86-SSE41-NEXT: pxor %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1 -; X86-SSE41-NEXT: pxor %xmm2, %xmm1 -; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm2 -; X86-SSE41-NEXT: movdqu 32(%eax), %xmm3 -; X86-SSE41-NEXT: pxor %xmm2, %xmm3 -; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm2 -; X86-SSE41-NEXT: movdqu 48(%eax), %xmm4 -; X86-SSE41-NEXT: pxor %xmm2, %xmm4 -; X86-SSE41-NEXT: por %xmm3, %xmm4 -; 
X86-SSE41-NEXT: por %xmm1, %xmm4 -; X86-SSE41-NEXT: por %xmm0, %xmm4 -; X86-SSE41-NEXT: ptest %xmm4, %xmm4 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: movdqu 32(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2 +; X86-SSE41-NEXT: pxor %xmm1, %xmm2 +; X86-SSE41-NEXT: movdqu 48(%ecx), %xmm1 +; X86-SSE41-NEXT: movdqu 48(%eax), %xmm3 +; X86-SSE41-NEXT: pxor %xmm1, %xmm3 +; X86-SSE41-NEXT: por %xmm2, %xmm3 +; X86-SSE41-NEXT: por %xmm0, %xmm3 +; X86-SSE41-NEXT: ptest %xmm3, %xmm3 ; X86-SSE41-NEXT: setne %al ; X86-SSE41-NEXT: retl %call = tail call i32 @memcmp(ptr %x, ptr %y, i32 64) nounwind @@ -2140,9 +2140,9 @@ ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 ; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X86-SSE2-NEXT: sete %al @@ -2159,9 +2159,9 @@ ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 ; X86-SSE41-NEXT: por %xmm3, %xmm2 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE41-NEXT: por %xmm2, %xmm1 ; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 ; X86-SSE41-NEXT: ptest %xmm0, %xmm0 ; X86-SSE41-NEXT: sete %al ; X86-SSE41-NEXT: retl diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -1607,10 +1607,10 @@ ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm3 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 ; X64-SSE2-NEXT: movdqu 32(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm1 ; X64-SSE2-NEXT: pand %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm3, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al @@ -1625,10 +1625,10 @@ ; X64-SSE41-NEXT: pxor %xmm0, %xmm3 ; X64-SSE41-NEXT: movdqu 16(%rsi), %xmm0 ; X64-SSE41-NEXT: pxor %xmm1, %xmm0 +; X64-SSE41-NEXT: por %xmm3, %xmm0 ; X64-SSE41-NEXT: movdqu 32(%rsi), %xmm1 ; X64-SSE41-NEXT: pxor %xmm2, %xmm1 ; X64-SSE41-NEXT: por %xmm0, %xmm1 -; X64-SSE41-NEXT: por %xmm3, %xmm1 ; X64-SSE41-NEXT: ptest %xmm1, %xmm1 ; X64-SSE41-NEXT: sete %al ; X64-SSE41-NEXT: retq @@ -1729,10 +1729,10 @@ ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm3 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm3, %xmm0 ; X64-SSE2-NEXT: movdqu 32(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm1 ; X64-SSE2-NEXT: pand %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm3, %xmm1 ; X64-SSE2-NEXT: pmovmskb %xmm1, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al @@ -1747,10 +1747,10 @@ ; X64-SSE41-NEXT: pxor %xmm0, %xmm3 ; X64-SSE41-NEXT: movdqu 16(%rsi), %xmm0 ; X64-SSE41-NEXT: pxor %xmm1, %xmm0 +; X64-SSE41-NEXT: por %xmm3, %xmm0 ; X64-SSE41-NEXT: movdqu 32(%rsi), %xmm1 ; X64-SSE41-NEXT: pxor %xmm2, %xmm1 ; X64-SSE41-NEXT: por %xmm0, %xmm1 -; X64-SSE41-NEXT: por %xmm3, %xmm1 ; X64-SSE41-NEXT: ptest %xmm1, %xmm1 ; X64-SSE41-NEXT: sete %al ; X64-SSE41-NEXT: retq @@ -1762,8 +1762,8 @@ ; X64-AVX-NEXT: vmovdqu 32(%rdi), %xmm2 ; X64-AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1 ; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; X64-AVX-NEXT: 
vpxor 32(%rsi), %xmm2, %xmm2 -; X64-AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 +; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpxor 32(%rsi), %xmm2, %xmm1 ; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; X64-AVX-NEXT: vptest %xmm0, %xmm0 ; X64-AVX-NEXT: sete %al @@ -1798,8 +1798,8 @@ ; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: pand %xmm1, %xmm0 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm0, %xmm2 ; X64-SSE2-NEXT: pmovmskb %xmm2, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF @@ -1813,8 +1813,8 @@ ; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm2 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; X64-SSE41-NEXT: por %xmm1, %xmm2 ; X64-SSE41-NEXT: por %xmm0, %xmm2 ; X64-SSE41-NEXT: ptest %xmm2, %xmm2 ; X64-SSE41-NEXT: setne %al @@ -1893,13 +1893,13 @@ ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm4 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 ; X64-SSE2-NEXT: movdqu 32(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm1 ; X64-SSE2-NEXT: movdqu 47(%rsi), %xmm2 ; X64-SSE2-NEXT: pcmpeqb %xmm3, %xmm2 ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm4, %xmm2 ; X64-SSE2-NEXT: pmovmskb %xmm2, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al @@ -1915,13 +1915,13 @@ ; X64-SSE41-NEXT: pxor %xmm0, %xmm4 ; X64-SSE41-NEXT: movdqu 16(%rsi), %xmm0 ; X64-SSE41-NEXT: pxor %xmm1, %xmm0 +; X64-SSE41-NEXT: por %xmm4, %xmm0 ; X64-SSE41-NEXT: movdqu 32(%rsi), %xmm1 ; X64-SSE41-NEXT: pxor %xmm2, %xmm1 ; X64-SSE41-NEXT: movdqu 47(%rsi), %xmm2 ; X64-SSE41-NEXT: pxor %xmm3, %xmm2 ; X64-SSE41-NEXT: por %xmm1, %xmm2 ; X64-SSE41-NEXT: por %xmm0, %xmm2 -; X64-SSE41-NEXT: por %xmm4, %xmm2 ; X64-SSE41-NEXT: ptest %xmm2, %xmm2 ; X64-SSE41-NEXT: setne %al ; X64-SSE41-NEXT: retq @@ -2020,9 +2020,9 @@ ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; X64-SSE2-NEXT: pand %xmm3, %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al @@ -2038,9 +2038,9 @@ ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; X64-SSE41-NEXT: por %xmm3, %xmm2 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE41-NEXT: por %xmm2, %xmm1 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE41-NEXT: por %xmm1, %xmm0 +; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: ptest %xmm0, %xmm0 ; X64-SSE41-NEXT: sete %al ; X64-SSE41-NEXT: retq @@ -2118,13 +2118,13 @@ ; X64-SSE2-NEXT: pcmpeqb %xmm0, %xmm4 ; X64-SSE2-NEXT: movdqu 16(%rsi), %xmm0 ; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm4, %xmm0 ; X64-SSE2-NEXT: movdqu 32(%rsi), %xmm1 ; X64-SSE2-NEXT: pcmpeqb %xmm2, %xmm1 ; X64-SSE2-NEXT: movdqu 48(%rsi), %xmm2 ; X64-SSE2-NEXT: pcmpeqb %xmm3, %xmm2 ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pand %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm4, %xmm2 ; X64-SSE2-NEXT: 
pmovmskb %xmm2, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: setne %al @@ -2140,13 +2140,13 @@ ; X64-SSE41-NEXT: pxor %xmm0, %xmm4 ; X64-SSE41-NEXT: movdqu 16(%rsi), %xmm0 ; X64-SSE41-NEXT: pxor %xmm1, %xmm0 +; X64-SSE41-NEXT: por %xmm4, %xmm0 ; X64-SSE41-NEXT: movdqu 32(%rsi), %xmm1 ; X64-SSE41-NEXT: pxor %xmm2, %xmm1 ; X64-SSE41-NEXT: movdqu 48(%rsi), %xmm2 ; X64-SSE41-NEXT: pxor %xmm3, %xmm2 ; X64-SSE41-NEXT: por %xmm1, %xmm2 ; X64-SSE41-NEXT: por %xmm0, %xmm2 -; X64-SSE41-NEXT: por %xmm4, %xmm2 ; X64-SSE41-NEXT: ptest %xmm2, %xmm2 ; X64-SSE41-NEXT: setne %al ; X64-SSE41-NEXT: retq @@ -2260,9 +2260,9 @@ ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; X64-SSE2-NEXT: pand %xmm3, %xmm2 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: pand %xmm1, %xmm0 +; X64-SSE2-NEXT: pand %xmm2, %xmm0 ; X64-SSE2-NEXT: pmovmskb %xmm0, %eax ; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; X64-SSE2-NEXT: sete %al @@ -2278,9 +2278,9 @@ ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; X64-SSE41-NEXT: por %xmm3, %xmm2 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; X64-SSE41-NEXT: por %xmm2, %xmm1 ; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE41-NEXT: por %xmm1, %xmm0 +; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: ptest %xmm0, %xmm0 ; X64-SSE41-NEXT: sete %al ; X64-SSE41-NEXT: retq @@ -2380,8 +2380,8 @@ ; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2 ; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 ; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 -; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2 -; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm1 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al @@ -2395,8 +2395,8 @@ ; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2 ; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2 -; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm1 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al @@ -2509,8 +2509,8 @@ ; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al @@ -2524,8 +2524,8 @@ ; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm1 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al @@ -2616,9 +2616,9 @@ ; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2 ; X64-AVX1-NEXT: 
vorps %ymm3, %ymm2, %ymm2 ; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 ; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -2634,9 +2634,9 @@ ; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2 ; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -2752,9 +2752,9 @@ ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -2770,9 +2770,9 @@ ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; X64-AVX2-NEXT: vzeroupper @@ -2866,9 +2866,9 @@ ; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2 ; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1 -; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 ; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: setne %al ; X64-AVX1-NEXT: vzeroupper @@ -2884,9 +2884,9 @@ ; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2 ; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: setne %al ; X64-AVX2-NEXT: vzeroupper @@ -3002,9 +3002,9 @@ ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1 ; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; X64-AVX1-NEXT: vptest %ymm0, %ymm0 ; X64-AVX1-NEXT: sete %al ; X64-AVX1-NEXT: vzeroupper @@ -3020,9 +3020,9 @@ ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vptest %ymm0, %ymm0 ; X64-AVX2-NEXT: sete %al ; 
X64-AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -907,8 +907,8 @@ ; SSE2-NEXT: paddq %xmm3, %xmm4 ; SSE2-NEXT: psllq $32, %xmm4 ; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_signed_reg_reg: @@ -938,16 +938,16 @@ ; SSE41-NEXT: movdqa %xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: psubq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrlq $1, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlq $1, %xmm0 ; SSE41-NEXT: psrlq $33, %xmm1 ; SSE41-NEXT: pmuludq %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm3 +; SSE41-NEXT: pmuludq %xmm0, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: psllq $32, %xmm3 +; SSE41-NEXT: pmuludq %xmm4, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq @@ -967,8 +967,8 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: retq ; ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_reg: @@ -986,8 +986,8 @@ ; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-FALLBACK-NEXT: retq ; ; XOP-LABEL: vec128_i64_signed_reg_reg: @@ -1006,8 +1006,8 @@ ; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i64_signed_reg_reg: @@ -1029,8 +1029,8 @@ ; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1065,8 +1065,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-FALLBACK-NEXT: vzeroupper ; AVX512BW-FALLBACK-NEXT: retq %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed @@ -1122,8 +1122,8 @@ ; SSE2-NEXT: paddq %xmm3, %xmm4 ; SSE2-NEXT: psllq $32, %xmm4 ; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: paddq %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_unsigned_reg_reg: @@ -1153,16 +1153,16 @@ ; SSE41-NEXT: movdqa 
%xmm3, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: psubq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrlq $1, %xmm3 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlq $1, %xmm0 ; SSE41-NEXT: psrlq $33, %xmm1 ; SSE41-NEXT: pmuludq %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm4, %xmm3 +; SSE41-NEXT: movdqa %xmm4, %xmm3 +; SSE41-NEXT: psrlq $32, %xmm3 +; SSE41-NEXT: pmuludq %xmm0, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: psllq $32, %xmm3 +; SSE41-NEXT: pmuludq %xmm4, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: retq @@ -1186,8 +1186,8 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: retq ; ; AVX2-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg: @@ -1209,8 +1209,8 @@ ; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm5, %xmm2, %xmm2 -; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-FALLBACK-NEXT: retq ; ; XOP-LABEL: vec128_i64_unsigned_reg_reg: @@ -1229,8 +1229,8 @@ ; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i64_unsigned_reg_reg: @@ -1252,8 +1252,8 @@ ; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1288,8 +1288,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-FALLBACK-NEXT: vzeroupper ; AVX512BW-FALLBACK-NEXT: retq %t3 = icmp ugt <2 x i64> %a1, %a2 @@ -1338,16 +1338,16 @@ ; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrlq $1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrlq $1, %xmm0 ; SSE2-NEXT: psrlq $33, %xmm3 ; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: paddq %xmm3, %xmm0 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psrlq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm0, %xmm4 +; SSE2-NEXT: paddq %xmm3, %xmm4 +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 ; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: retq @@ -1380,16 +1380,16 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm3, 
%xmm1 ; SSE41-NEXT: psubq %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlq $1, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrlq $1, %xmm0 ; SSE41-NEXT: psrlq $33, %xmm1 ; SSE41-NEXT: pmuludq %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm0 -; SSE41-NEXT: paddq %xmm1, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm2 +; SSE41-NEXT: pmuludq %xmm0, %xmm2 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: psllq $32, %xmm2 +; SSE41-NEXT: pmuludq %xmm4, %xmm0 ; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1410,8 +1410,8 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: retq ; ; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_reg: @@ -1430,8 +1430,8 @@ ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX2-FALLBACK-NEXT: retq ; ; XOP-LABEL: vec128_i64_signed_mem_reg: @@ -1451,8 +1451,8 @@ ; XOP-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; XOP-NEXT: vpsllq $32, %xmm0, %xmm0 ; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i64_signed_mem_reg: @@ -1474,8 +1474,8 @@ ; AVX512F-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1511,8 +1511,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm0, %xmm0 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512BW-FALLBACK-NEXT: vzeroupper ; AVX512BW-FALLBACK-NEXT: retq %a1 = load <2 x i64>, ptr %a1_addr @@ -1570,8 +1570,8 @@ ; SSE2-NEXT: paddq %xmm2, %xmm4 ; SSE2-NEXT: psllq $32, %xmm4 ; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: paddq %xmm3, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: vec128_i64_signed_reg_mem: @@ -1601,16 +1601,16 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; SSE41-NEXT: psubq %xmm5, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: psrlq $1, %xmm2 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: psrlq $1, %xmm0 ; SSE41-NEXT: psrlq $33, %xmm3 ; SSE41-NEXT: pmuludq %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: 
pmuludq %xmm2, %xmm0 -; SSE41-NEXT: paddq %xmm3, %xmm0 -; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm4, %xmm2 +; SSE41-NEXT: movdqa %xmm4, %xmm2 +; SSE41-NEXT: psrlq $32, %xmm2 +; SSE41-NEXT: pmuludq %xmm0, %xmm2 +; SSE41-NEXT: paddq %xmm3, %xmm2 +; SSE41-NEXT: psllq $32, %xmm2 +; SSE41-NEXT: pmuludq %xmm4, %xmm0 ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: retq @@ -1631,8 +1631,8 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: retq ; ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_mem: @@ -1651,8 +1651,8 @@ ; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-FALLBACK-NEXT: retq ; ; XOP-LABEL: vec128_i64_signed_reg_mem: @@ -1672,8 +1672,8 @@ ; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i64_signed_reg_mem: @@ -1695,8 +1695,8 @@ ; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1732,8 +1732,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-FALLBACK-NEXT: vzeroupper ; AVX512BW-FALLBACK-NEXT: retq %a2 = load <2 x i64>, ptr %a2_addr @@ -1782,16 +1782,16 @@ ; SSE2-NEXT: pandn %xmm0, %xmm3 ; SSE2-NEXT: por %xmm4, %xmm3 ; SSE2-NEXT: psubq %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrlq $1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: psrlq $1, %xmm0 ; SSE2-NEXT: psrlq $33, %xmm3 ; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: paddq %xmm3, %xmm0 -; SSE2-NEXT: psllq $32, %xmm0 -; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: psrlq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm0, %xmm4 +; SSE2-NEXT: paddq %xmm3, %xmm4 +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 ; SSE2-NEXT: paddq %xmm1, %xmm0 ; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: retq @@ -1824,16 +1824,16 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 ; SSE41-NEXT: psubq %xmm5, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: psrlq $1, %xmm1 +; SSE41-NEXT: movdqa %xmm3, %xmm0 +; SSE41-NEXT: psrlq $1, %xmm0 ; SSE41-NEXT: psrlq $33, %xmm3 ; SSE41-NEXT: pmuludq %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm1, %xmm0 -; SSE41-NEXT: paddq %xmm3, %xmm0 
-; SSE41-NEXT: psllq $32, %xmm0 -; SSE41-NEXT: pmuludq %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: psrlq $32, %xmm1 +; SSE41-NEXT: pmuludq %xmm0, %xmm1 +; SSE41-NEXT: paddq %xmm3, %xmm1 +; SSE41-NEXT: psllq $32, %xmm1 +; SSE41-NEXT: pmuludq %xmm4, %xmm0 ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: paddq %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1855,8 +1855,8 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: retq ; ; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_mem: @@ -1876,8 +1876,8 @@ ; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX2-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX2-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX2-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX2-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-FALLBACK-NEXT: retq ; ; XOP-LABEL: vec128_i64_signed_mem_mem: @@ -1898,8 +1898,8 @@ ; XOP-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; XOP-NEXT: vpsllq $32, %xmm1, %xmm1 ; XOP-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; XOP-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i64_signed_mem_mem: @@ -1921,8 +1921,8 @@ ; AVX512F-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX512F-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512F-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1959,8 +1959,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm4, %xmm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX512BW-FALLBACK-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-FALLBACK-NEXT: vzeroupper ; AVX512BW-FALLBACK-NEXT: retq %a1 = load <2 x i64>, ptr %a1_addr @@ -2107,9 +2107,9 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psubusw %xmm1, %xmm3 ; SSE2-NEXT: psubusw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: psubw %xmm0, %xmm3 ; SSE2-NEXT: paddw %xmm1, %xmm3 -; SSE2-NEXT: paddw %xmm0, %xmm3 ; SSE2-NEXT: psrlw $1, %xmm3 ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: paddw %xmm3, %xmm0 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -504,10 +504,10 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm7, %xmm3 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-FALLBACK-NEXT: retq ; @@ -527,8 +527,8 @@ ; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; 
AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-FALLBACK-LABEL: vec256_i64_signed_reg_reg: @@ -564,10 +564,10 @@ ; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm7, %xmm3 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3 -; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOP-FALLBACK-NEXT: retq ; @@ -604,10 +604,10 @@ ; XOPAVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3 ; XOPAVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 -; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -630,8 +630,8 @@ ; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: vec256_i64_signed_reg_reg: @@ -665,8 +665,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed %t4 = select <4 x i1> %t3, <4 x i64> , <4 x i64> @@ -718,10 +718,10 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm6, %xmm2 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm3, %xmm3 -; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm8, %xmm2 +; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm8, %xmm3 ; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-FALLBACK-NEXT: retq ; @@ -745,8 +745,8 @@ ; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-FALLBACK-LABEL: vec256_i64_unsigned_reg_reg: @@ -782,10 +782,10 @@ ; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm7, %xmm3 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm3, %xmm3 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3 -; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 
$1, %xmm3, %ymm0, %ymm0 ; XOP-FALLBACK-NEXT: retq ; @@ -822,10 +822,10 @@ ; XOPAVX1-NEXT: vpaddq %xmm3, %xmm7, %xmm3 ; XOPAVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 -; XOPAVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 -; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -848,8 +848,8 @@ ; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: vec256_i64_unsigned_reg_reg: @@ -883,8 +883,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq %t3 = icmp ugt <4 x i64> %a1, %a2 %t4 = select <4 x i1> %t3, <4 x i64> , <4 x i64> @@ -932,10 +932,10 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm7, %xmm4 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm4, %xmm4 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-FALLBACK-NEXT: retq ; @@ -956,8 +956,8 @@ ; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0 ; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; XOP-FALLBACK-LABEL: vec256_i64_signed_mem_reg: @@ -994,10 +994,10 @@ ; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm7, %xmm4 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm4, %xmm4 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-FALLBACK-NEXT: retq ; @@ -1035,10 +1035,10 @@ ; XOPAVX1-NEXT: vpaddq %xmm4, %xmm7, %xmm4 ; XOPAVX1-NEXT: vpsllq $32, %xmm4, %xmm4 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOPAVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -1061,8 +1061,8 @@ ; AVX512F-NEXT: vpaddq %ymm0, %ymm4, 
%ymm0 ; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0 ; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: vec256_i64_signed_mem_reg: @@ -1097,8 +1097,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq %a1 = load <4 x i64>, ptr %a1_addr %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed @@ -1145,10 +1145,10 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-FALLBACK-NEXT: retq ; @@ -1169,8 +1169,8 @@ ; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-FALLBACK-LABEL: vec256_i64_signed_reg_mem: @@ -1207,10 +1207,10 @@ ; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-FALLBACK-NEXT: retq ; @@ -1248,10 +1248,10 @@ ; XOPAVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2 ; XOPAVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -1274,8 +1274,8 @@ ; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: vec256_i64_signed_reg_mem: @@ -1310,8 +1310,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; 
AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq %a2 = load <4 x i64>, ptr %a2_addr %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed @@ -1359,10 +1359,10 @@ ; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm7, %xmm1 ; AVX1-FALLBACK-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm5, %xmm1 -; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; AVX1-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; AVX1-FALLBACK-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-FALLBACK-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-FALLBACK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ; AVX1-FALLBACK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-FALLBACK-NEXT: retq ; @@ -1384,8 +1384,8 @@ ; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; XOP-FALLBACK-LABEL: vec256_i64_signed_mem_mem: @@ -1423,10 +1423,10 @@ ; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm7, %xmm2 ; XOP-FALLBACK-NEXT: vpsllq $32, %xmm2, %xmm2 ; XOP-FALLBACK-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; XOP-FALLBACK-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; XOP-FALLBACK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; XOP-FALLBACK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOP-FALLBACK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOP-FALLBACK-NEXT: retq ; @@ -1465,10 +1465,10 @@ ; XOPAVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2 ; XOPAVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5 -; XOPAVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; XOPAVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3 +; XOPAVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; XOPAVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; XOPAVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -1491,8 +1491,8 @@ ; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: vec256_i64_signed_mem_mem: @@ -1528,8 +1528,8 @@ ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1 ; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX512BW-FALLBACK-NEXT: retq %a1 = load <4 x i64>, ptr %a1_addr %a2 = load <4 x i64>, ptr %a2_addr diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll --- a/llvm/test/CodeGen/X86/midpoint-int.ll +++ b/llvm/test/CodeGen/X86/midpoint-int.ll @@ -304,41 +304,40 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmpl %ebp, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: 
sbbl %esi, %ecx -; X86-NEXT: setl %cl -; X86-NEXT: movzbl %cl, %edx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: setl %dl +; X86-NEXT: movzbl %dl, %ebx ; X86-NEXT: jl .LBB5_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl %ebp, %esi ; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: jmp .LBB5_3 ; X86-NEXT: .LBB5_1: -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: .LBB5_3: -; X86-NEXT: negl %edx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: orl $1, %ebp -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: subl %esi, %eax +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: shrl %edi -; X86-NEXT: imull %eax, %edx ; X86-NEXT: imull %ebp, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: mull %ebp ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -377,43 +376,42 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl %ebx, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: testb %cl, %cl +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: setb %dl +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: testb %dl, %dl ; X86-NEXT: jne .LBB6_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: jmp .LBB6_3 ; X86-NEXT: .LBB6_1: -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: .LBB6_3: -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: orl $1, %ebx -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %ebp, %edi +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: orl $1, %ebp +; X86-NEXT: subl %esi, %eax +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: shrl %edi -; X86-NEXT: imull %eax, %edx -; X86-NEXT: imull %ebx, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: mull %ebx +; X86-NEXT: imull %ebp, %edi ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ 
-458,39 +456,39 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %esi -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl (%ecx), %esi +; X86-NEXT: movl 4(%ecx), %ecx +; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: setl %dl -; X86-NEXT: movzbl %dl, %edx +; X86-NEXT: movzbl %dl, %ebx ; X86-NEXT: jl .LBB7_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %esi, %edx ; X86-NEXT: jmp .LBB7_3 ; X86-NEXT: .LBB7_1: ; X86-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: .LBB7_3: -; X86-NEXT: negl %edx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: orl $1, %ebp -; X86-NEXT: subl %ebx, %eax +; X86-NEXT: subl %edx, %eax ; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload ; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: shrl %edi -; X86-NEXT: imull %eax, %edx ; X86-NEXT: imull %ebp, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: mull %ebp ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -533,42 +531,41 @@ ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl (%ecx), %eax -; X86-NEXT: movl 4(%ecx), %edi -; X86-NEXT: cmpl %ebp, %eax -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: sbbl %esi, %ecx -; X86-NEXT: setl %cl -; X86-NEXT: movzbl %cl, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%edx), %eax +; X86-NEXT: movl 4(%edx), %edi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: setl %dl +; X86-NEXT: movzbl %dl, %ebx ; X86-NEXT: jl .LBB8_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl %ebp, %esi ; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: jmp .LBB8_3 ; X86-NEXT: .LBB8_1: -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %ebp, %esi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: .LBB8_3: -; X86-NEXT: negl %edx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: orl $1, %ebp -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: subl %esi, %eax +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: shrl %edi -; X86-NEXT: imull %eax, %edx ; X86-NEXT: imull %ebp, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: mull %ebp ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: 
addl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -613,40 +610,40 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %ecx -; X86-NEXT: movl 4(%eax), %esi +; X86-NEXT: movl (%eax), %esi +; X86-NEXT: movl 4(%eax), %ecx ; X86-NEXT: movl (%edx), %eax ; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: setl %dl -; X86-NEXT: movzbl %dl, %edx +; X86-NEXT: movzbl %dl, %ebx ; X86-NEXT: jl .LBB9_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %esi, %edx ; X86-NEXT: jmp .LBB9_3 ; X86-NEXT: .LBB9_1: ; X86-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: .LBB9_3: -; X86-NEXT: negl %edx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: orl $1, %ebp -; X86-NEXT: subl %ebx, %eax +; X86-NEXT: subl %edx, %eax ; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload ; X86-NEXT: shrdl $1, %edi, %eax +; X86-NEXT: imull %eax, %ebx +; X86-NEXT: mull %ebp +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: shrl %edi -; X86-NEXT: imull %eax, %edx ; X86-NEXT: imull %ebp, %edi -; X86-NEXT: addl %edx, %edi -; X86-NEXT: mull %ebp ; X86-NEXT: addl %edi, %edx -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl $4, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -130,9 +130,9 @@ ; SSE-LABEL: allones_v64i8_sign: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax ; SSE-NEXT: cmpw $-1, %ax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -141,9 +141,9 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpw $-1, %ax ; AVX1-NEXT: sete %al @@ -186,9 +186,9 @@ ; SSE-LABEL: allzeros_v64i8_sign: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -198,8 +198,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: sete %al @@ -1322,10 +1322,10 @@ ; SSE-LABEL: allones_v64i8_and1: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pand 
%xmm1, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: psllw $7, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax ; SSE-NEXT: cmpw $-1, %ax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -1334,9 +1334,9 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpw $-1, %ax @@ -1383,10 +1383,10 @@ ; SSE-LABEL: allzeros_v64i8_and1: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -1396,8 +1396,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax @@ -2611,10 +2611,10 @@ ; SSE-LABEL: allones_v64i8_and4: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: psllw $5, %xmm0 -; SSE-NEXT: pmovmskb %xmm0, %eax +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: psllw $5, %xmm1 +; SSE-NEXT: pmovmskb %xmm1, %eax ; SSE-NEXT: cmpw $-1, %ax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -2623,9 +2623,9 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpw $-1, %ax @@ -2672,10 +2672,10 @@ ; SSE-LABEL: allzeros_v64i8_and4: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: psllw $5, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $5, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -2685,8 +2685,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax diff --git a/llvm/test/CodeGen/X86/mul-constant-i64.ll b/llvm/test/CodeGen/X86/mul-constant-i64.ll --- a/llvm/test/CodeGen/X86/mul-constant-i64.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i64.ll @@ -497,18 +497,13 @@ define i64 @test_mul_by_17(i64 %x) { ; X86-LABEL: test_mul_by_17: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: shll $4, %esi +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $4, %ecx +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl $17, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; ; X86-NOOPT-LABEL: test_mul_by_17: @@ -697,8 +692,8 @@ ; X86-NEXT: leal (%ecx,%eax,4), %esi ; X86-NEXT: movl $22, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -999,8 +994,8 @@ ; X86-NEXT: addl %ecx, %ecx ; X86-NEXT: movl $29, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %edx ; X86-NEXT: addl %ecx, %edx +; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -1522,8 +1517,8 @@ ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: imull %esi, %ebx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: imull %ecx, %edi -; X86-NEXT: addl %ebx, %edi ; X86-NEXT: addl %edi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -1558,8 +1553,8 @@ ; X86-NOOPT-NEXT: movl %esi, %eax ; X86-NOOPT-NEXT: mull %edi ; X86-NOOPT-NEXT: imull %esi, %ebx +; X86-NOOPT-NEXT: addl %ebx, %edx ; X86-NOOPT-NEXT: imull %ecx, %edi -; X86-NOOPT-NEXT: addl %ebx, %edi ; X86-NOOPT-NEXT: addl %edi, %edx ; X86-NOOPT-NEXT: popl %esi ; X86-NOOPT-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll --- a/llvm/test/CodeGen/X86/mul-constant-result.ll +++ b/llvm/test/CodeGen/X86/mul-constant-result.ll @@ -524,15 +524,18 @@ define i32 @foo() local_unnamed_addr #0 { ; X86-LABEL: foo: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: pushl %edi +; X86-NEXT: pushl %ebx ; X86-NEXT: .cfi_def_cfa_offset 12 -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %edi ; X86-NEXT: .cfi_def_cfa_offset 16 -; X86-NEXT: .cfi_offset %esi, -16 -; X86-NEXT: .cfi_offset %edi, -12 -; X86-NEXT: .cfi_offset %ebx, -8 +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: .cfi_offset %esi, -20 +; X86-NEXT: .cfi_offset %edi, -16 +; X86-NEXT: .cfi_offset %ebx, -12 +; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: pushl $0 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $1 @@ -549,8 +552,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $2, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $2, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: pushl $1 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $3 @@ -558,9 +562,8 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $3, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $3, %ebx ; X86-NEXT: pushl $2 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $4 @@ -568,9 +571,10 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $4, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $4, %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edi, %esi ; X86-NEXT: pushl $2 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $5 @@ -580,7 +584,6 @@ ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi ; X86-NEXT: xorl $5, %edi -; 
X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $3 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $6 @@ -601,6 +604,7 @@ ; X86-NEXT: movl %eax, %edi ; X86-NEXT: xorl $7, %edi ; X86-NEXT: orl %ebx, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: pushl $4 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $8 @@ -610,7 +614,6 @@ ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: xorl $8, %ebx -; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $4 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $9 @@ -618,9 +621,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $9, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $9, %esi +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: pushl $5 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $10 @@ -630,7 +633,7 @@ ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: xorl $10, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: pushl $5 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $11 @@ -638,9 +641,10 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $11, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $11, %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edi, %esi ; X86-NEXT: pushl $6 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $12 @@ -650,7 +654,6 @@ ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: xorl $12, %ebx -; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $6 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $13 @@ -678,9 +681,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $15, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: xorl $15, %ebp +; X86-NEXT: orl %ebx, %ebp ; X86-NEXT: pushl $8 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $16 @@ -688,9 +691,10 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $16, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $16, %edi +; X86-NEXT: orl %ebp, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: pushl $8 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $17 @@ -698,9 +702,8 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $17, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $17, %ebx ; X86-NEXT: pushl $9 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $18 @@ -708,9 +711,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $18, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $18, %esi +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: pushl $9 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $19 @@ -718,9 +721,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $19, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $19, %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: pushl $10 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl 
$20 @@ -728,9 +731,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $20, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $20, %esi +; X86-NEXT: orl %ebx, %esi ; X86-NEXT: pushl $10 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $21 @@ -738,9 +741,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $21, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $21, %ebx +; X86-NEXT: orl %esi, %ebx ; X86-NEXT: pushl $11 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $22 @@ -748,9 +751,10 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $22, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $22, %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edi, %esi ; X86-NEXT: pushl $11 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $23 @@ -758,9 +762,8 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $23, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $23, %ebx ; X86-NEXT: pushl $12 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $24 @@ -768,9 +771,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $24, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $24, %edi +; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $12 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $25 @@ -778,9 +781,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $25, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $25, %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $13 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $26 @@ -788,9 +791,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $26, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: xorl $26, %edi +; X86-NEXT: orl %ebx, %edi ; X86-NEXT: pushl $13 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $27 @@ -798,9 +801,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $27, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: xorl $27, %ebx +; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $14 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $28 @@ -808,9 +811,9 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: xorl $28, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: xorl $28, %ebp +; X86-NEXT: orl %ebx, %ebp ; X86-NEXT: pushl $14 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $29 @@ -820,7 +823,8 @@ ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %edi ; X86-NEXT: xorl $29, %edi -; X86-NEXT: orl %ebx, %edi +; X86-NEXT: orl %ebp, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: pushl $15 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $30 @@ -830,7 
+834,6 @@ ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: xorl $30, %ebx -; X86-NEXT: orl %edi, %ebx ; X86-NEXT: pushl $15 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $31 @@ -838,10 +841,10 @@ ; X86-NEXT: calll mult@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: .cfi_adjust_cfa_offset -8 -; X86-NEXT: movl %eax, %edi -; X86-NEXT: xorl $31, %edi -; X86-NEXT: orl %ebx, %edi -; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edi, %esi ; X86-NEXT: pushl $16 ; X86-NEXT: .cfi_adjust_cfa_offset 4 ; X86-NEXT: pushl $32 @@ -851,15 +854,17 @@ ; X86-NEXT: .cfi_adjust_cfa_offset -8 ; X86-NEXT: xorl $32, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: setne %cl ; X86-NEXT: negl %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi -; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %ebx +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: popl %ebp ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -10,111 +10,112 @@ ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi ; X32-NEXT: subl $400, %esp # imm = 0x190 -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 60(%ecx), %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 56(%ecx), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%ebp), %esi +; X32-NEXT: movl 60(%ecx), %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 56(%ecx), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%eax), %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl 4(%ebp), %ecx -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 4(%eax), %ecx +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: movl %ecx, %edi ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %edi ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 48(%esi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 48(%edi), %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 52(%esi), %eax +; X32-NEXT: movl 52(%edi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 8(%eax), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 8(%eax), %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl 12(%eax), %edi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: 
adcl %ebp, %ebx +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -122,165 +123,166 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 40(%esi), %ebp -; X32-NEXT: movl %ebp, %eax -; 
X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl 40(%esi), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 44(%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi +; X32-NEXT: movl 44(%esi), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 32(%edi), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl 32(%ebp), %edi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl 36(%edi), %edi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl 36(%ebp), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %esi, %ebx +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: movl %edi, %esi +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movl (%esp), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X32-NEXT: adcl %edi, %eax @@ -288,7 +290,7 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -298,66 +300,67 @@ ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 16(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl 16(%eax), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 20(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl 20(%eax), %edx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %edx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %edi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %esi +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -365,78 +368,77 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 24(%eax), %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 28(%eax), %ecx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebp, %edi -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: 
setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %bl, %edi +; X32-NEXT: adcl %edi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -444,71 +446,72 @@ ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl %edi, %edx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb %bl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl 
%eax, %ebp -; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -516,74 +519,73 @@ ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl %ebx, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X32-NEXT: adcl %edi, %eax -; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax +; X32-NEXT: movl %eax, %esi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -593,115 +595,116 @@ ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 24(%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: 
movl 28(%esi), %ebp +; X32-NEXT: movl 24(%esi), %ebp ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl 28(%esi), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 16(%esi), %edi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 16(%edi), %esi +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 20(%esi), %eax +; X32-NEXT: movl 20(%edi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) 
# 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %ebx +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax @@ -712,89 +715,89 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %ebp, 
%eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl 8(%esi), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 8(%esi), %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 12(%esi), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl 12(%esi), %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl (%esi), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: setb %bl +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl 4(%esi), %eax +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl (%ebp), %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: 
movl %ecx, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl 4(%ebp), %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -802,43 +805,44 @@ ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %esi, %ebx +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; 
X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx @@ -846,33 +850,33 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax +; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %eax ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -880,130 +884,130 @@ ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, %ecx +; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: setb %bl ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: mull %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %ebp, %ecx ; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; 
X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %edi +; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edi, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movzbl %bl, %edi +; X32-NEXT: adcl %edi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl 
%ebp, %edi -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx @@ -1018,41 +1022,42 @@ ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl %edi, %edx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb %bl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %edi ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax @@ -1062,98 +1067,96 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; 
X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebp +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; 
X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax @@ -1170,7 +1173,7 @@ ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl $0, %ebx @@ -1184,9 +1187,9 @@ ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload @@ -1201,50 +1204,53 @@ ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X32-NEXT: movl 32(%ebp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 32(%ebx), %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %edi ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl 36(%ebp), %ebp -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 36(%eax), %ecx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %ebx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %ecx, %ebp ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx @@ -1252,8 +1258,7 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %ebx, %ecx @@ -1265,24 +1270,23 @@ ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 40(%eax), %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: mull %ebp +; X32-NEXT: movl 40(%eax), %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; 
X32-NEXT: mull %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 44(%eax), %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebx +; X32-NEXT: movl 44(%eax), %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi @@ -1290,17 +1294,16 @@ ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -1317,20 +1320,20 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -1338,60 +1341,61 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl (%esp), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1400,75 +1404,76 @@ ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: 
movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl %esi, %edi ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebx, %ecx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi 
# 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload ; X32-NEXT: adcl %ebx, %eax ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1478,34 +1483,33 @@ ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 48(%eax), %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl 48(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, %ecx -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 52(%eax), %edx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %edx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb %bl ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %esi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax @@ -1513,29 +1517,29 @@ ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, %esi ; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -1550,34 +1554,34 @@ ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 60(%eax), %ecx ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebx, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ebp, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -1589,25 +1593,25 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi -; 
X32-NEXT: setb %bl +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %bl, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X32-NEXT: adcl %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ebx, %edi ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax @@ -1621,7 +1625,7 @@ ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %ebx, %edx +; X32-NEXT: movl %edi, %edx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %ebp, %ecx ; X32-NEXT: adcl $0, %ecx @@ -1664,27 +1668,27 @@ ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %edi, %esi ; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx @@ -1704,30 +1708,30 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %edi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: 
movl %eax, %edi +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -1739,30 +1743,30 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %ecx, %ebx -; X32-NEXT: adcl %eax, %ebx +; X32-NEXT: movl %ecx, %edi +; X32-NEXT: adcl %eax, %edi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1775,11 +1779,10 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -1790,7 +1793,7 @@ ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -1803,7 +1806,6 @@ ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1847,54 +1849,55 @@ ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebx, %esi +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload @@ -1902,24 +1905,24 @@ ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: adcl %ebp, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -1927,7 +1930,7 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi @@ -1947,7 +1950,7 @@ ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill +; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -1957,267 +1960,268 @@ ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; 
X32-NEXT: mull %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebx ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; 
X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %ebp, %ecx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movl %ebp, %esi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; 
X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload ; X32-NEXT: adcl %ebx, %eax ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %ebp -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb %bl +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp 
# 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %esi ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi ; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %ecx -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx @@ -2230,63 +2234,62 @@ ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %ebx, %edx +; X32-NEXT: movl %edi, %edx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %edi, %ecx -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi @@ -2296,83 +2299,82 @@ ; X32-NEXT: setb %bl ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: adcl %ebx, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %edi, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb (%esp) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: addl %ebx, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X32-NEXT: movl %edi, %ebp -; X32-NEXT: adcl %eax, %ebp +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -2384,14 +2386,14 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl %eax, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -2418,141 +2420,143 @@ ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 64(%eax), %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 68(%eax), %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ecx -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill -; 
X32-NEXT: movl %edx, %esi +; X32-NEXT: movl 68(%eax), %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb %bl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ebp -; X32-NEXT: setb %cl -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi -; X32-NEXT: movzbl %cl, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: setb %bl +; X32-NEXT: movl %ecx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 72(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl 72(%eax), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; 
X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 76(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: movl 76(%eax), %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ebp +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %esi +; X32-NEXT: adcl %edi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; 
X32-NEXT: mull %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -2562,96 +2566,97 @@ ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb %bl ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %esi, %edi +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %esi ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: 
setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -2659,44 +2664,42 @@ ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl %bl, %ebx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: addl %ecx, %edx +; X32-NEXT: addl %ecx, %eax +; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X32-NEXT: adcl %ebx, %eax -; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X32-NEXT: adcl %edi, %eax +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -2704,63 +2707,64 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 80(%eax), %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl 
%edi, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 84(%eax), %ecx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %ecx, %ebx +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ecx, %ebp ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb %bl ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %edi -; X32-NEXT: setb %bl -; X32-NEXT: movl %esi, %eax +; X32-NEXT: adcl %esi, %edi +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -2768,39 +2772,38 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 88(%eax), %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; 
X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 92(%eax), %edi -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, %esi -; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %ebp, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -2813,26 +2816,26 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: movl %edi, %ecx +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; 
X32-NEXT: adcl %ecx, %eax ; X32-NEXT: adcl $0, %edx @@ -2845,16 +2848,15 @@ ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl %ebx, %edx +; X32-NEXT: movl %edi, %edx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: movl %ebp, %ecx -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload @@ -2864,130 +2866,130 @@ ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: addl %ecx, %edi ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %edi +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax ; 
X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %ebp, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax +; 
X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %esi, %edi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, %edi +; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %edx, %edi -; X32-NEXT: adcl %eax, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: addl %ecx, %edx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: mull %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: addl %ebx, %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -2997,14 +2999,14 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -3039,7 +3041,7 @@ ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: 
movl (%esp), %esi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx @@ -3086,7 +3088,7 @@ ; X32-NEXT: movl %eax, %esi ; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi @@ -3135,52 +3137,56 @@ ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %esi, %ecx +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: addl %edx, %ebx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %esi +; X32-NEXT: addl %edx, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ecx, %edx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %esi, %ebp -; X32-NEXT: addl %edx, %ebp -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %edx, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl %ebx, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax +; X32-NEXT: imull %edi, %esi +; X32-NEXT: addl %edx, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp -; X32-NEXT: setb %bl +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte 
Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -3190,48 +3196,50 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%esp), %ebx # 4-byte Reload -; X32-NEXT: imull %ebx, %ebp -; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: addl %edx, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %edi, %edx +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %ecx, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: imull %edi, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %esi, %ecx ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl %ecx, %edi ; X32-NEXT: adcl %ebp, %ebx ; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull (%esp) # 4-byte Folded Reload +; X32-NEXT: mull %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload @@ -3240,79 +3248,79 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload 
; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, (%esp) # 4-byte Spill +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 104(%ecx), %ebp -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 104(%esi), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl 108(%ecx), %ecx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi -; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl 108(%esi), %esi +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %edi, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx -; X32-NEXT: movl %ebx, %edi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 96(%ecx), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 96(%esi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 100(%ecx), %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl 100(%esi), %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl %esi, %eax -; X32-NEXT: 
mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: adcl %ebp, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -3323,31 +3331,32 @@ ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi ; X32-NEXT: adcl %ebx, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: mull %esi +; X32-NEXT: movl %esi, %ebx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebp, %esi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl %eax, %edx +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ecx -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill @@ -3358,21 +3367,21 @@ ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: adcl %edi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl %ebx, %ebp +; X32-NEXT: mull %ecx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax @@ -3380,98 +3389,101 @@ ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 112(%ecx), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull %eax, %edi +; X32-NEXT: movl 112(%ecx), %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl 116(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull %eax, %ebx -; X32-NEXT: addl %edi, %ebx ; X32-NEXT: addl %edx, %ebx ; X32-NEXT: movl 120(%ecx), %eax -; X32-NEXT: movl %ecx, %edx -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %ecx -; X32-NEXT: movl 124(%edx), %edi +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %edi -; X32-NEXT: addl %ecx, %edi ; X32-NEXT: mull %ebp -; X32-NEXT: addl %edx, %edi +; X32-NEXT: addl %esi, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl 124(%esi), %esi +; X32-NEXT: imull %ebp, %esi +; X32-NEXT: addl %edx, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %edi +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl %ebp, %ebx -; X32-NEXT: adcl 
$0, %esi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebp +; X32-NEXT: adcl %ecx, %ebp ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi ; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movzbl %bl, %esi -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: movzbl %bl, %ecx +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %edx +; X32-NEXT: adcl %esi, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: addl %edx, %ebx +; X32-NEXT: imull %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %edx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %edi, %ecx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: mull %edi +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %eax, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebp, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl %ebp, %ebx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: adcl %edi, %ebp ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3480,10 +3492,9 @@ ; X32-NEXT: movzbl %bl, %edi ; X32-NEXT: adcl %edi, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -3505,7 +3516,7 @@ ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload @@ -3542,7 +3553,7 @@ ; X32-NEXT: addl %ecx, %edi ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax @@ -3633,7 +3644,7 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi @@ -3653,7 +3664,7 @@ ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebx, (%esp) # 4-byte Folded Spill +; X32-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -3678,7 +3689,7 @@ ; X32-NEXT: addl %ecx, %edi ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl (%esp), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax @@ -3708,7 +3719,7 @@ ; X32-NEXT: addl %edi, %ebp ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl (%esp), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax @@ -3803,7 +3814,7 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl 
$0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -3948,7 +3959,7 @@ ; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -3983,7 +3994,7 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ -4006,6 +4017,7 @@ ; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb %bl ; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %edi, %ebp ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx @@ -4015,18 +4027,18 @@ ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl %esi, %edi ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -4048,9 +4060,9 @@ ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -4081,7 +4093,7 @@ ; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %ebx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4108,7 +4120,7 @@ ; 
X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi @@ -4140,7 +4152,7 @@ ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi @@ -4161,14 +4173,14 @@ ; X32-NEXT: addl %esi, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 104(%eax), %esi -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi @@ -4190,7 +4202,7 @@ ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: adcl %ebp, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %esi @@ -4207,7 +4219,7 @@ ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4239,106 +4251,110 @@ ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %edi -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: movl %edi, %esi +; X32-NEXT: imull %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %edi, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %edi, %edx +; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ecx, %esi +; 
X32-NEXT: movl %ebp, %eax +; X32-NEXT: imull %ebp, %esi ; X32-NEXT: addl %edx, %esi -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edx, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %ebp ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 124(%edi), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl 120(%edi), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 120(%ebx), %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull (%esp), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl 112(%edi), %ecx -; X32-NEXT: movl 116(%edi), %edi +; X32-NEXT: movl 124(%ebx), %eax +; X32-NEXT: imull %ecx, %eax +; X32-NEXT: addl %eax, %esi +; X32-NEXT: movl 112(%ebx), %edi +; X32-NEXT: movl 116(%ebx), %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx -; X32-NEXT: imull %edi, %edx -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %ecx, %ebx -; X32-NEXT: addl %edx, %ebx -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edx, %ebx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: imull %ebp, %ebx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %edi, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %esi, %ebp ; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl (%esp), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %esi, %edi +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ebp +; X32-NEXT: mull %esi ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -4348,7 +4364,7 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, (%esp) # 4-byte Spill +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -4377,7 +4393,7 @@ ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4411,7 +4427,7 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax @@ -4445,9 +4461,9 @@ ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload @@ -4476,98 +4492,101 @@ ; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl %ecx, %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %edi -; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %edi, %esi +; X32-NEXT: imull %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: imull %ebp, %ecx -; X32-NEXT: addl %edi, %ecx ; X32-NEXT: addl %edx, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: imull %ebx, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %edx +; X32-NEXT: mull %esi +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %edi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ecx, %edi -; X32-NEXT: addl %edx, %edi -; X32-NEXT: mull %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: imull %esi, %edi ; X32-NEXT: addl %edx, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %esi, %ebx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %edi, %ebp ; X32-NEXT: setb %cl -; X32-NEXT: movl %esi, %eax -; 
X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movzbl %cl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %eax, %ecx +; X32-NEXT: imull %esi, %ecx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %eax, %edi +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: addl %edx, %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: imull %edi, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %ecx -; X32-NEXT: addl %edx, %ecx ; X32-NEXT: mull %ebp +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %ebp, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ebp, %ebx +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi @@ -4583,7 +4602,7 @@ ; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -4605,9 +4624,9 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: 
movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -4618,9 +4637,9 @@ ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -4658,9 +4677,9 @@ ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X32-NEXT: movl (%esp), %edx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4698,7 +4717,7 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl (%esp), %edi # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -4787,198 +4806,198 @@ ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq 40(%rdi), %rbx -; X64-NEXT: movq 32(%rdi), %r12 -; X64-NEXT: movq 56(%rdi), %r14 +; X64-NEXT: movq 32(%rdi), %r14 +; X64-NEXT: movq 56(%rdi), %r15 ; X64-NEXT: movq 48(%rdi), %r10 ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq (%rsi), %r11 -; X64-NEXT: movq 8(%rsi), %rcx -; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq 8(%rsi), %r8 +; X64-NEXT: movq %rsi, %r12 ; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: 
addq %rcx, %r9 +; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r9, %r8 -; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r9d +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r13 +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r10, %rsi +; X64-NEXT: adcq %r9, %r13 ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r10, %rdi -; X64-NEXT: adcq %r9, %r15 -; X64-NEXT: movq %r12, %rax ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbx, %r14 +; X64-NEXT: movq %rbx, %r15 ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %r9, %r11 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r9 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r9, %rbx ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r11 -; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: adcq %r8, %r11 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r13), %rcx +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: adcq $0, %r13 ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r14, %rbp +; X64-NEXT: movq 16(%r12), %r8 +; X64-NEXT: movq %r14, %r10 ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r8, %r14 +; X64-NEXT: addq %rdi, %r14 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq 24(%r13), %r13 -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: movq 24(%r12), %rbp +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: adcq %r9, %r12 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %r12, %r9 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r8 -; X64-NEXT: addq %rbx, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rax, %rdi +; X64-NEXT: addq %rbx, %rcx +; X64-NEXT: movq %rcx, (%rsp) # 
8-byte Spill ; X64-NEXT: adcq %r11, %r14 -; X64-NEXT: movq %r14, (%rsp) # 8-byte Spill +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: addq %rdi, %r9 -; X64-NEXT: adcq %r15, %r8 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: adcq %r13, %rdi ; X64-NEXT: setb %r10b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r13 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %r11, %rax ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: adcq %rdi, %rsi -; X64-NEXT: setb %dil -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r13 -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: movzbl %dil, %esi -; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq %r9, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r8, %r11 +; X64-NEXT: adcq %rsi, %rcx +; X64-NEXT: setb %sil +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movzbl %sil, %ecx +; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: addq %r9, %r15 +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %rdi, %r11 ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r10b, %esi -; X64-NEXT: adcq %rsi, %rax +; X64-NEXT: movzbl %r10b, %ecx +; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 16(%rcx), %r10 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: mulq %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq 16(%r8), %rsi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq 24(%rcx), %r14 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq 24(%r8), %r14 ; X64-NEXT: movq %r14, %rax ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %r13, %rbx +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq 
%rcx, %r11 ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: addq %r8, %rsi -; X64-NEXT: adcq %rdi, %r11 +; X64-NEXT: addq %r11, %rsi +; X64-NEXT: adcq %rdi, %rbx ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %r13, %r12 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r11, %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %rbx, %rdi ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rbp -; X64-NEXT: movq (%rcx), %r13 +; X64-NEXT: adcq %rax, %rcx +; X64-NEXT: movq (%r8), %r13 ; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq 8(%rcx), %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: movq 8(%r8), %rax +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %r11, %r14 ; X64-NEXT: adcq $0, %rbx ; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq %r12, %rcx +; X64-NEXT: movq %r12, %r11 ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %rbx, %r12 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r8, %rbp +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r11 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %r12, %rbx @@ -4986,32 +5005,32 @@ ; X64-NEXT: adcq %rax, %r11 ; X64-NEXT: addq %r9, %rbx ; X64-NEXT: adcq %rsi, %r11 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r13, %rcx +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %rcx +; X64-NEXT: movq %r13, %r10 ; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: mulq %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rsi, %r14 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: adcq %r9, %r12 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r12, %rsi @@ -5023,199 +5042,203 @@ ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: addq %r8, %rsi -; X64-NEXT: adcq %rbp, %r9 +; X64-NEXT: addq %rdi, %rsi +; X64-NEXT: adcq %rcx, %r9 ; X64-NEXT: setb %r10b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 
8-byte Reload -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rdi, %r11 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: adcq $0, %r11 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r11, %rcx -; X64-NEXT: adcq %r8, %rbx +; X64-NEXT: addq %rbx, %rcx +; X64-NEXT: adcq %r11, %r14 ; X64-NEXT: setb %r11b -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: mulq %r15 +; X64-NEXT: addq %r14, %rax ; X64-NEXT: movzbl %r11b, %edi ; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq %rsi, %r14 +; X64-NEXT: addq %rsi, %r12 ; X64-NEXT: adcq %r9, %rcx ; X64-NEXT: movzbl %r10b, %esi ; X64-NEXT: adcq %rsi, %rax ; X64-NEXT: adcq $0, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; X64-NEXT: adcq (%rsp), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq (%rsp), %rdx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq 32(%r10), %rcx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq 32(%rcx), %rdi +; X64-NEXT: movq %r8, %r10 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r12, %r8 -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq %rbx, %r14 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rsi, %r11 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq 40(%r10), %rcx -; X64-NEXT: movq %r10, %rdi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r12 +; X64-NEXT: movq 40(%rcx), %rsi +; 
X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rsi, %r15 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r11, %rsi ; X64-NEXT: adcq %r9, %rbx ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rbx, %r11 ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r14 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rbx, %r13 ; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %r15, %rbx +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, (%rsp) # 8-byte Spill -; X64-NEXT: adcq %r14, %r15 -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: adcq %r14, %r10 +; X64-NEXT: setb %r15b +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %r15, %r14 -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r10, %r14 +; X64-NEXT: movzbl %r15b, %eax ; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload +; X64-NEXT: addq %r8, %r14 ; X64-NEXT: adcq %rsi, %rbx ; X64-NEXT: adcq $0, %r11 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq 48(%rdi), %r12 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq 48(%rcx), %rcx +; X64-NEXT: movq %rbp, %r15 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %rsi, %r13 -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq 56(%rdi), %rsi -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq 56(%r8), %rsi +; X64-NEXT: movq %r15, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r13, %rdi -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: setb %r10b -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %r13, %r12 +; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: setb %r8b +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq 
%r15, %r13 +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: addq %r14, %rbp -; X64-NEXT: movq %rbp, %r10 +; X64-NEXT: movq %rbp, %r8 +; X64-NEXT: movq %r12, %rdi ; X64-NEXT: adcq %rbx, %rdi ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: addq %r11, %r13 ; X64-NEXT: adcq %r9, %rsi -; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill +; X64-NEXT: setb %bpl ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %r9 +; X64-NEXT: addq %r9, %rbx +; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbx, %r12 -; X64-NEXT: adcq %r9, %rcx -; X64-NEXT: setb %r15b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rbx, %r9 +; X64-NEXT: adcq %r10, %r15 +; X64-NEXT: setb %r10b ; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: movzbl %r15b, %eax +; X64-NEXT: addq %r15, %rbx +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r14 -; X64-NEXT: addq %r13, %rbp -; X64-NEXT: adcq %rsi, %r12 -; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; X64-NEXT: addq %r13, %r12 +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: movzbl %bpl, %eax ; X64-NEXT: adcq %rax, %rbx ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: adcq %rax, (%rsp) # 8-byte Folded Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rbp ; X64-NEXT: adcq $0, %r12 +; X64-NEXT: adcq $0, %r9 ; X64-NEXT: adcq $0, %rbx ; X64-NEXT: adcq $0, %r14 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload ; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded 
Reload ; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill @@ -5224,127 +5247,127 @@ ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rsi, %r13 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rcx, %r10 ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rdi, %r12 -; X64-NEXT: adcq %rsi, %rcx -; X64-NEXT: setb %r10b +; X64-NEXT: addq %r10, %r12 +; X64-NEXT: adcq %rsi, %r15 +; X64-NEXT: setb %r8b ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r15, %rsi +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %r9 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rcx, %r13 +; X64-NEXT: addq %r10, %r13 ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r10, %r11 -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %rcx -; X64-NEXT: setb %r10b +; X64-NEXT: adcq %r15, %r10 +; X64-NEXT: setb %r8b ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: movq %rbp, %r15 -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: addq %r10, %rbp +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: addq %rdi, %rbp ; X64-NEXT: adcq %r12, %r13 -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: adcq $0, %rsi ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r11, %r10 +; X64-NEXT: movq %r11, %r8 ; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq %r15, %r12 -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte 
Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: addq %r15, %rax ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: adcq %r8, %rcx +; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rcx, %r12 +; X64-NEXT: addq %r8, %r12 ; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %r8 +; X64-NEXT: adcq %rax, %rdi ; X64-NEXT: addq %rbp, %r11 ; X64-NEXT: adcq %r13, %r15 ; X64-NEXT: movq %r15, %rbp ; X64-NEXT: adcq $0, %r12 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: addq %rdi, %r12 -; X64-NEXT: adcq %r9, %r8 -; X64-NEXT: setb %r9b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: addq %rsi, %r12 +; X64-NEXT: adcq %r9, %rdi +; X64-NEXT: setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r13, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: mulq %r13 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r8, %rax +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %sil, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: addq %r12, %r10 -; X64-NEXT: adcq %r8, %rdi -; X64-NEXT: movzbl %r9b, %ecx +; X64-NEXT: adcq %rdi, %r8 +; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: adcq $0, %rdx @@ -5359,342 +5382,350 @@ ; X64-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq 64(%r9), %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq 64(%r10), %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, 
%rcx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r15 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq 72(%r10), %rcx -; X64-NEXT: movq %r10, %rsi -; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %rdi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rsi, %r8 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq 72(%r9), %rsi +; X64-NEXT: movq %r9, %rcx +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rsi, %r13 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r9, %rbx -; X64-NEXT: adcq %r8, %rcx -; X64-NEXT: setb %r10b +; X64-NEXT: addq %r8, %rbx +; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: setb %r8b ; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r13 -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: movzbl %r10b, %eax -; X64-NEXT: adcq %rax, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, %rax +; X64-NEXT: addq %r10, %r9 +; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: adcq %rax, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r15, %rdi ; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r12, %rdi +; X64-NEXT: movq %r13, %r12 +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %r15, %rax +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: addq %r14, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %rcx +; X64-NEXT: adcq %r10, %r8 ; X64-NEXT: setb %r10b -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r15, %r13 +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: addq %r8, %rbp ; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: addq %r11, %rbp ; X64-NEXT: adcq %rbx, %r14 ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: movq 80(%rsi), %r15 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movq 80(%rcx), %r15 +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq 
%rax, %rcx +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq 88(%r10), %r10 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: adcq %r11, %rcx -; X64-NEXT: setb %r11b -; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r8, %r11 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq 88(%rbx), %rbx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: addq %r11, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rcx, %r13 -; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: addq %r8, %r13 +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %rbp, %rsi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %rbp, %rcx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r13 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: addq %r9, %r13 -; X64-NEXT: adcq %rdi, %r12 -; X64-NEXT: setb %r9b -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; X64-NEXT: movq %r14, %rax +; X64-NEXT: adcq %rsi, %r12 +; X64-NEXT: setb %bpl +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdi, %r10 ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: adcq %r8, %rcx +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: addq %r10, %rax +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: adcq %r8, %rdi ; X64-NEXT: setb %r8b -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: addq %rcx, %rax +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movzbl %r8b, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r13, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %rsi +; X64-NEXT: addq %r13, %rsi ; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %r9b, %ecx +; X64-NEXT: adcq %r12, %r9 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movzbl %bpl, %ecx ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %r10 -; X64-NEXT: movq %rax, %r9 +; 
X64-NEXT: imulq %rax, %rbx +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: imulq %rsi, %r15 -; X64-NEXT: addq %r10, %r15 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rbx, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: imulq %rcx, %r15 ; X64-NEXT: addq %rdx, %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %rdx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r14, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r10, %rdx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %rbx, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: imulq %rdi, %r10 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: adcq %r15, %r10 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r15 +; X64-NEXT: imulq %rsi, %rbx +; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: addq %r8, %rdi +; X64-NEXT: adcq %r15, %rbx +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r15, %r13 -; X64-NEXT: adcq $0, %r12 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %r13, %r15 -; X64-NEXT: adcq %r12, %rbp -; X64-NEXT: setb %cl -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: addq %r8, %r15 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r15, %r11 +; X64-NEXT: adcq %r10, %r8 +; X64-NEXT: setb %r10b +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %rbp, %r12 -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r8, %r15 +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: addq %r8, %r12 -; X64-NEXT: adcq %r10, %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; X64-NEXT: movq 120(%rdx), %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %rcx -; X64-NEXT: movq 112(%rdx), %r9 -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: mulq %r9 +; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: adcq %rbx, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq 112(%r9), %rbx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; X64-NEXT: imulq %rbx, %r9 -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: addq %rdx, %r9 -; X64-NEXT: movq 96(%rdi), %rbp -; X64-NEXT: movq 104(%rdi), %rdi -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; X64-NEXT: imulq %rcx, %rbx 
+; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: movq 120(%r9), %rax ; X64-NEXT: imulq %rdi, %rax -; X64-NEXT: imulq %rbp, %r11 -; X64-NEXT: addq %rax, %r11 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %rdi, %rbp +; X64-NEXT: addq %rax, %rbx +; X64-NEXT: movq 96(%r9), %r10 +; X64-NEXT: movq 104(%r9), %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: imulq %rdi, %r12 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rdx, %r11 +; X64-NEXT: addq %r12, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r10, %r14 +; X64-NEXT: addq %rdx, %r14 ; X64-NEXT: addq %r8, %r13 -; X64-NEXT: adcq %r9, %r11 -; X64-NEXT: movq %r11, %r14 -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r8, %r9 -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r9, %r8 -; X64-NEXT: adcq %rcx, %rbp -; X64-NEXT: setb %cl +; X64-NEXT: movq %rax, %r12 +; X64-NEXT: addq %r8, %r12 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %r12, %rbx +; X64-NEXT: adcq %rbp, %r10 +; X64-NEXT: setb %r8b ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: addq %rbp, %rax -; X64-NEXT: movzbl %cl, %ecx -; X64-NEXT: adcq %rcx, %rdx +; X64-NEXT: mulq %rcx +; X64-NEXT: addq %r10, %rax +; X64-NEXT: movzbl %r8b, %edi +; X64-NEXT: adcq %rdi, %rdx ; X64-NEXT: addq %r13, %rax ; X64-NEXT: adcq %r14, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; X64-NEXT: adcq %r15, %r8 -; X64-NEXT: adcq %r12, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: adcq %r11, %rbx +; X64-NEXT: adcq %r15, %rax ; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq 80(%r8), %r9 -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; X64-NEXT: movq 80(%r13), %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %rax, %rbx ; 
X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 88(%r8), %rbx -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 88(%r13), %r11 +; X64-NEXT: movq %r13, %r10 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rsi, %r11 +; X64-NEXT: movq %rsi, %r9 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %rcx, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rdi, %r14 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %dil -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %rcx, %rsi ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: adcq %rax, %rbx -; X64-NEXT: movq 64(%r8), %rdi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: adcq %rax, %r13 +; X64-NEXT: movq %r10, %rdi +; X64-NEXT: movq 64(%r10), %r10 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq 72(%r8), %rax -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: mulq %r11 +; X64-NEXT: movq 72(%rdi), %rax +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %rcx, %r12 ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r8 -; X64-NEXT: mulq %r9 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq %r11, %r9 +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %r12, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r15, %rcx ; X64-NEXT: setb %dil -; X64-NEXT: movq %r13, %r11 -; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r8, %r11 +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %r12 -; X64-NEXT: addq %r10, %rbp +; X64-NEXT: addq %rbx, %rbp ; X64-NEXT: adcq %r14, %r12 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: mulq %r13 +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq %r10, %rdi +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r10, %rax +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rbx ; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %r13 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rcx, %r14 ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r14, %r8 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: adcq %r10, %rcx 
; X64-NEXT: setb %dil -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 @@ -5703,26 +5734,26 @@ ; X64-NEXT: adcq %rax, %r10 ; X64-NEXT: addq %rbp, %r9 ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r12, %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %r14 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: adcq %rbx, %r10 +; X64-NEXT: adcq %r13, %r10 ; X64-NEXT: setb %dil -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r8 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r13 +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %r9, %rax @@ -5734,8 +5765,8 @@ ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %sil, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: addq %r14, %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: addq %r14, %r12 +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq %r10, %r9 ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movzbl %dil, %ecx @@ -5743,163 +5774,160 @@ ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq 96(%r8), %rcx -; X64-NEXT: imulq %rcx, %r15 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r13, %r9 -; X64-NEXT: mulq %r13 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq 104(%r8), %rdi -; X64-NEXT: imulq %rdi, %r9 -; X64-NEXT: addq %r15, %r9 -; X64-NEXT: addq %rdx, %r9 -; X64-NEXT: movq 112(%r8), %rax -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; X64-NEXT: imulq %r11, %rdx -; X64-NEXT: movq 120(%r8), %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: imulq %r10, %r8 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: mulq %r10 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: addq %rsi, %r13 -; X64-NEXT: adcq %r9, %r8 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: movq %r10, %r9 -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq 96(%rdi), %rsi +; X64-NEXT: imulq %rsi, %r15 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %r8, %rcx +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %r15, %rdx +; X64-NEXT: movq 104(%rdi), %r9 +; X64-NEXT: imulq %r9, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq 112(%rdi), %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: imulq %r12, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: movq 
120(%rdi), %rdi +; X64-NEXT: imulq %r15, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: addq %r10, %r8 +; X64-NEXT: adcq %r14, %rdi +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %r10, %r13 +; X64-NEXT: adcq $0, %r14 +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rsi, %r10 -; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r12 -; X64-NEXT: addq %r10, %r12 -; X64-NEXT: adcq %rcx, %rsi -; X64-NEXT: setb %cl -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %r13, %rbp +; X64-NEXT: adcq %r14, %rcx +; X64-NEXT: setb %sil +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rsi, %r14 -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: addq %rcx, %r14 +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: addq %r13, %r14 -; X64-NEXT: adcq %r8, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %rsi -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r8, %r14 +; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; X64-NEXT: imulq %r15, %rcx -; X64-NEXT: addq %rsi, %rcx -; X64-NEXT: addq %rdx, %rcx -; X64-NEXT: movq %rcx, %r8 -; X64-NEXT: movq %rbp, %rax +; X64-NEXT: imulq %r15, %rdi +; X64-NEXT: movq %r15, %rax ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: imulq %rsi, %rax -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: imulq %rdi, %rbx -; X64-NEXT: addq %rax, %rbx -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; X64-NEXT: imulq %r12, %rsi +; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: movq %rsi, %r8 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: imulq %r9, %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: imulq %r11, %rbx ; X64-NEXT: addq %rdx, %rbx -; X64-NEXT: addq %r9, %rcx +; X64-NEXT: addq %rcx, %r13 ; X64-NEXT: adcq %r8, %rbx -; X64-NEXT: movq %rbx, %rbp -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r9 -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %rbx -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r8, %rdi -; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r8, %rsi +; X64-NEXT: adcq $0, 
%rdi +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rdi, %r11 -; X64-NEXT: adcq %rsi, %r8 +; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: adcq %rdi, %r8 ; X64-NEXT: setb %sil -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %r15 +; X64-NEXT: movq %r9, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: addq %r8, %rax ; X64-NEXT: movzbl %sil, %esi ; X64-NEXT: adcq %rsi, %rdx -; X64-NEXT: addq %rcx, %rax -; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; X64-NEXT: adcq %r12, %r11 +; X64-NEXT: addq %r13, %rax +; X64-NEXT: adcq %rbx, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq %rbp, %r11 ; X64-NEXT: adcq %r14, %rax ; X64-NEXT: adcq %r10, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: movq %rcx, %rdi -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload ; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; X64-NEXT: movq %rdi, %r9 ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, (%rcx) -; X64-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, 8(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, 16(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, 24(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, 32(%rcx) -; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, 40(%rcx) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, 48(%rcx) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; X64-NEXT: movq %rsi, 56(%rcx) -; X64-NEXT: movq %rdi, 64(%rcx) -; X64-NEXT: movq %r8, 72(%rcx) -; X64-NEXT: movq %r9, 80(%rcx) -; X64-NEXT: movq %r10, 88(%rcx) -; X64-NEXT: movq %r13, 96(%rcx) -; X64-NEXT: movq %r11, 104(%rcx) -; X64-NEXT: movq %rax, 112(%rcx) -; X64-NEXT: movq %rdx, 120(%rcx) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, (%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 8(%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 16(%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 24(%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 32(%rsi) +; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 40(%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 48(%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 56(%rsi) +; X64-NEXT: movq %r8, 64(%rsi) +; X64-NEXT: movq %r9, 72(%rsi) +; X64-NEXT: movq %r10, 80(%rsi) +; X64-NEXT: movq %rbx, 88(%rsi) +; X64-NEXT: movq %rcx, 96(%rsi) +; X64-NEXT: movq %r11, 104(%rsi) +; X64-NEXT: movq %rax, 112(%rsi) +; X64-NEXT: movq %rdx, 120(%rsi) ; X64-NEXT: addq $240, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll --- a/llvm/test/CodeGen/X86/mul-i256.ll +++ b/llvm/test/CodeGen/X86/mul-i256.ll @@ -23,141 +23,140 @@ ; X32-NEXT: .cfi_offset %ebp, -8 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 12(%ecx), %ebp -; X32-NEXT: movl 8(%ecx), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl (%eax), %ebx +; X32-NEXT: movl 12(%ecx), %esi +; X32-NEXT: movl 8(%ecx), %ebx ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl (%eax), %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl %esi, %ecx +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 4(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx -; 
X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl 4(%eax), %ebp +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi -; X32-NEXT: addl %ecx, %eax +; X32-NEXT: adcl %esi, %ebx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl (%edi), %esi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: mull %ebx +; X32-NEXT: movl (%edi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl 4(%edi), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ecx, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl 4(%edi), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb %bl -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %esi, %ebp -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %ebx, %ecx +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %ecx, %ebp +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 8(%eax), %esi -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 8(%eax), %ecx +; X32-NEXT: movl 
%ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %esi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl 12(%eax), %ecx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebx, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %ecx +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %esi +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebx, %ecx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: adcl %ebx, %esi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: mull %edi +; 
X32-NEXT: movl %edi, %ebx +; X32-NEXT: addl %esi, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %edx +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -167,41 +166,43 @@ ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl 16(%ecx), %edi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: imull %edi, %ebx +; X32-NEXT: movl %ebx, %esi +; X32-NEXT: imull %edi, %esi ; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl (%esp), %esi # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: addl %esi, %edx +; X32-NEXT: movl 20(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl 20(%ecx), %ebp -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %ebx, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill +; X32-NEXT: imull %eax, %ebp +; X32-NEXT: addl %edx, %ebp ; X32-NEXT: movl 24(%ecx), %eax -; X32-NEXT: movl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %edx -; X32-NEXT: movl 28(%ecx), %ecx +; X32-NEXT: movl %eax, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 28(%ecx), %ecx +; X32-NEXT: imull %esi, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: mull %ebx -; X32-NEXT: addl %edx, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %ebp, %ecx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %ebx ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax @@ -213,84 +214,85 @@ ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movzbl %bl, %esi ; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 28(%edi), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull 
%ebp, %ecx -; X32-NEXT: movl 24(%edi), %esi -; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 24(%edi), %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: addl %ecx, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl 16(%edi), %ecx +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: movl 28(%edi), %eax +; X32-NEXT: imull %esi, %eax +; X32-NEXT: addl %eax, %ecx +; X32-NEXT: movl 16(%edi), %ebp ; X32-NEXT: movl 20(%edi), %ebx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx -; X32-NEXT: imull %ebx, %edx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: imull %ebx, %edi ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %ecx, %edi -; X32-NEXT: addl %edx, %edi -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edx, %edi +; X32-NEXT: mull %ebp +; X32-NEXT: addl %edi, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: imull %ebp, %esi +; X32-NEXT: addl %edx, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi +; X32-NEXT: adcl %ecx, %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %edi, %esi -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx +; X32-NEXT: adcl $0, %ecx +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx -; X32-NEXT: adcl %ebx, %edi -; X32-NEXT: setb %bl +; X32-NEXT: movl %eax, %esi +; X32-NEXT: addl %ebx, %esi +; X32-NEXT: adcl %ecx, %edi +; X32-NEXT: setb %cl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: addl %edi, %eax -; X32-NEXT: movzbl %bl, %esi -; X32-NEXT: adcl %esi, %edx +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: addl (%esp), %esi # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %eax # 4-byte Folded Reload ; X32-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X32-NEXT: movl %esi, %ebx -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, %ebx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, (%esi) +; X32-NEXT: movl %edi, (%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 4(%esi) +; X32-NEXT: movl %edi, 4(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 8(%esi) +; X32-NEXT: movl %edi, 8(%ecx) ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, 12(%esi) -; X32-NEXT: movl %ebx, 16(%esi) -; X32-NEXT: movl %ecx, 20(%esi) -; X32-NEXT: movl %eax, 24(%esi) -; X32-NEXT: movl %edx, 28(%esi) +; X32-NEXT: movl %edi, 12(%ecx) +; X32-NEXT: movl %ebx, 16(%ecx) +; X32-NEXT: movl %esi, 20(%ecx) +; X32-NEXT: movl %eax, 24(%ecx) +; X32-NEXT: movl %edx, 28(%ecx) ; X32-NEXT: addl $72, %esp ; X32-NEXT: .cfi_def_cfa_offset 20 ; X32-NEXT: popl %esi @@ -309,12 +311,9 @@ ; X64-NEXT: .cfi_def_cfa_offset 16 ; X64-NEXT: pushq %r14 ; X64-NEXT: .cfi_def_cfa_offset 24 -; X64-NEXT: pushq %r12 -; X64-NEXT: .cfi_def_cfa_offset 32 ; X64-NEXT: pushq %rbx -; X64-NEXT: .cfi_def_cfa_offset 40 -; X64-NEXT: .cfi_offset %rbx, -40 -; X64-NEXT: .cfi_offset %r12, -32 +; X64-NEXT: .cfi_def_cfa_offset 32 +; X64-NEXT: .cfi_offset %rbx, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 ; X64-NEXT: movq %rdx, %rcx @@ -330,16 +329,16 @@ ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: imulq %r14, %r10 -; X64-NEXT: addq %r15, %r10 ; X64-NEXT: addq %rdx, %r10 -; X64-NEXT: movq %r8, %r12 -; X64-NEXT: imulq %r11, %r12 +; X64-NEXT: addq %r15, %r10 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: imulq %r11, %r15 ; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r15, %rdx ; X64-NEXT: movq 24(%rsi), %r15 ; X64-NEXT: imulq %rbx, %r15 -; X64-NEXT: addq %r12, %r15 ; X64-NEXT: addq %rdx, %r15 ; X64-NEXT: addq %rdi, %r8 ; X64-NEXT: adcq %r10, %r15 @@ -372,8 +371,6 @@ ; X64-NEXT: movq %rax, 16(%rcx) ; X64-NEXT: movq %rdx, 24(%rcx) ; X64-NEXT: popq %rbx -; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: popq %r12 ; X64-NEXT: .cfi_def_cfa_offset 24 ; X64-NEXT: popq %r14 ; X64-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -33,6 +33,7 @@ ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %ecx, %ebp +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -46,74 +47,73 @@ ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 16(%ecx), %ebx -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 16(%ecx), %ebp +; 
X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl 20(%ecx), %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 20(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %edi ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %esi, %ecx ; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl %ebp, %esi -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %edi, %ebp ; X32-NEXT: setb %cl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ebp, %esi -; X32-NEXT: movzbl %cl, %eax -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movzbl %cl, %ecx +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 8(%eax), %ecx -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl 8(%edi), %ebp +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %ebp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 12(%eax), %ecx -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl 12(%edi), %ecx +; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx +; X32-NEXT: movl %eax, %esi ; X32-NEXT: adcl %ebp, %edi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx ; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload @@ -125,7 +125,7 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ebp @@ -145,7 +145,7 @@ ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X32-NEXT: adcl %esi, %edx -; X32-NEXT: addl %ecx, (%esp) # 4-byte Folded Spill +; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload @@ -154,9 +154,9 @@ ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 8(%ecx), %edi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl 8(%ecx), %ebx +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -166,20 +166,19 @@ ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ebp, %ebx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %edi -; X32-NEXT: movl %edi, %ebx +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload @@ -192,66 +191,66 @@ ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl 4(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ebp -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; 
X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %ebx ; X32-NEXT: movl %ebx, %esi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebp, %ecx -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %eax, %esi ; X32-NEXT: addl %ecx, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %ebp +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebp +; X32-NEXT: movl %edx, %edi ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %ebp +; X32-NEXT: adcl %ebp, %edi ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: movl %eax, %ebp +; X32-NEXT: addl %edi, %ebp ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %edi +; X32-NEXT: adcl $0, %ebp ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -263,28 +262,27 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl 
%eax, %ebp -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %edi, %esi +; X32-NEXT: setb %bl +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: adcl %ecx, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: movl %ebp, %edx +; X32-NEXT: addl %ebp, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X32-NEXT: adcl %edi, %eax @@ -297,109 +295,111 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 16(%eax), %ebp -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: movl 16(%eax), %esi +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %ecx, %ebx -; X32-NEXT: adcl $0, %edi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ecx, %edi +; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 20(%eax), %ecx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax ; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %ebx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx -; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi +; X32-NEXT: adcl %esi, %ecx 
+; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: addl %ecx, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl %bl, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %esi, %ebp -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ebp, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb %bl ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %esi, %ecx +; X32-NEXT: addl %ebp, %ecx ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 24(%eax), %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl 24(%eax), %ebx +; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, %edi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl 28(%eax), %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi -; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl 28(%eax), %ebx +; X32-NEXT: movl %esi, %eax +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, %edi -; X32-NEXT: adcl %ebp, %ebx +; X32-NEXT: adcl %ebp, %esi 
; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %ebx, %ebp +; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %ebp +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %esi, %ebx ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebp +; X32-NEXT: adcl $0, %ebx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill @@ -407,29 +407,31 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi -; X32-NEXT: addl %ebx, %eax -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: adcl %edi, %esi +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ebp, %ecx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %ebp +; X32-NEXT: movl %edx, %ebp +; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: adcl %esi, %ebp ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: addl %esi, %eax +; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: addl %ebp, %edi +; X32-NEXT: addl %ebx, %edi +; X32-NEXT: movl (%esp), %ebx # 4-byte Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X32-NEXT: adcl %ecx, %eax @@ -445,20 +447,21 @@ ; X32-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl %edi, %edx ; X32-NEXT: adcl $0, %edx -; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: adcl $0, %ecx ; X32-NEXT: adcl $0, %eax ; X32-NEXT: adcl $0, %esi -; X32-NEXT: addl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, 
(%esp) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %ecx @@ -467,80 +470,81 @@ ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi -; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ecx, %edi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx -; X32-NEXT: addl %edi, %eax +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %ebx -; X32-NEXT: setb (%esp) # 1-byte Folded Spill +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: setb %bl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: addl %ebx, %eax +; X32-NEXT: mull %edi +; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: addl %ecx, %ebx +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %edi -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %edi, %ecx +; X32-NEXT: adcl %esi, %edi ; X32-NEXT: setb %bl ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %esi +; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: addl %edi, %esi ; X32-NEXT: movzbl %bl, %eax ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X32-NEXT: movl %ebx, %eax +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %ebp, %eax -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %edi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: mull %edi +; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %edi, %ebx +; X32-NEXT: adcl %ebx, %ecx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %ebx ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl %ebx, %edi +; X32-NEXT: addl %ecx, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx ; X32-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -549,59 +553,58 @@ ; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: setb (%esp) # 1-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: movl %esi, %eax +; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %ebx +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %esi +; X32-NEXT: adcl $0, %esi +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl %ebx, %ecx +; X32-NEXT: mull %ebx +; X32-NEXT: movl %edx, %ebx ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: adcl %ebx, %esi +; X32-NEXT: adcl %esi, %ebx ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: addl %esi, %ebx -; X32-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: movl %edx, %esi -; X32-NEXT: adcl %eax, %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X32-NEXT: addl %edi, %edx +; X32-NEXT: addl %ebx, %eax +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X32-NEXT: adcl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: addl %edi, %ecx ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %ebx -; X32-NEXT: adcl $0, %esi +; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X32-NEXT: adcl %esi, %eax +; X32-NEXT: movl %eax, %esi +; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl %eax, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %ebp ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl $0, %edx +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 32(%eax), %edi ; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -609,7 +612,7 @@ ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: movl %ebp, %eax ; X32-NEXT: mull %edi @@ -621,7 +624,7 @@ ; X32-NEXT: movl 36(%eax), %ecx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %ebx, %eax @@ -649,7 +652,7 @@ ; X32-NEXT: addl %ecx, %ebp ; X32-NEXT: adcl $0, %edi ; X32-NEXT: movl %esi, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl (%esp), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: addl %ebp, %eax @@ -662,14 +665,14 @@ ; X32-NEXT: addl %esi, %edi ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Folded Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl 40(%eax), %esi -; X32-NEXT: movl %esi, (%esp) # 4-byte Spill +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi @@ -708,7 +711,7 @@ ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -740,115 +743,120 @@ ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %edi -; X32-NEXT: movl (%esp), %ecx # 4-byte Reload +; X32-NEXT: movl %edi, %esi +; X32-NEXT: imull %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: mull %ecx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: imull %ebx, %ecx -; X32-NEXT: addl %edi, %ecx ; X32-NEXT: addl %edx, %ecx -; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl (%esp), %edi # 4-byte Reload +; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: mull %ebp +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: imull %ebp, %esi +; X32-NEXT: addl %edx, %esi +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ecx, %edi -; X32-NEXT: addl %edx, %edi ; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl %edx, %edi -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload -; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: mull %edi -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, %ebp ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %edi +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %edi -; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ecx, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx +; X32-NEXT: movl %ebx, %ebp ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: addl %edi, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X32-NEXT: adcl %esi, %ecx ; X32-NEXT: setb %bl -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: movl (%esp), %eax # 4-byte Reload +; X32-NEXT: mull %ebp ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx -; X32-NEXT: addl %ebp, %eax +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl 60(%edi), %ecx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %ecx -; X32-NEXT: movl 56(%edi), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl 56(%ebx), %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: addl %ecx, %esi +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X32-NEXT: addl %edx, %esi -; X32-NEXT: movl 48(%edi), %ecx -; X32-NEXT: movl 52(%edi), %ebx +; X32-NEXT: movl 60(%ebx), %eax +; X32-NEXT: imull %ecx, %eax +; X32-NEXT: addl %eax, %esi +; X32-NEXT: movl 48(%ebx), %edi +; X32-NEXT: movl 52(%ebx), %ebp ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx -; X32-NEXT: imull %ebx, %edx -; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %ecx, %edi -; X32-NEXT: addl %edx, %edi -; X32-NEXT: mull %ecx -; X32-NEXT: addl %edx, %edi -; X32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X32-NEXT: movl %eax, %ebx +; X32-NEXT: imull %ebp, %ebx +; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: mull %edi +; X32-NEXT: addl %ebx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: imull %edi, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %esi, %edi -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: mull %esi -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: adcl %esi, %ecx +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X32-NEXT: mull %ecx +; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: mull %esi +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %ecx ; X32-NEXT: movl %edx, %ebx -; X32-NEXT: movl %eax, %esi -; X32-NEXT: addl (%esp), %esi # 4-byte Folded Reload -; X32-NEXT: adcl $0, %ebx -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebp -; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: addl %esi, %ebp +; X32-NEXT: adcl $0, %ebx +; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ecx +; X32-NEXT: movl %eax, %edi +; X32-NEXT: addl %ebp, %edi ; X32-NEXT: adcl 
%ebx, %ecx ; X32-NEXT: setb %bl ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X32-NEXT: mull %esi ; X32-NEXT: addl %ecx, %eax ; X32-NEXT: movzbl %bl, %ecx ; X32-NEXT: adcl %ecx, %edx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X32-NEXT: adcl %edi, %edx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload @@ -880,10 +888,10 @@ ; X32-NEXT: movl %edi, %eax ; X32-NEXT: mull %ebp ; X32-NEXT: addl %esi, %eax -; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %eax, (%esp) # 4-byte Spill ; X32-NEXT: movzbl %cl, %eax ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl 32(%ecx), %esi ; X32-NEXT: movl %esi, %eax @@ -917,8 +925,8 @@ ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X32-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -951,20 +959,20 @@ ; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edi ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X32-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X32-NEXT: movl %esi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx -; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %eax, (%esp) # 4-byte Spill +; X32-NEXT: movl %edx, (%esp) # 4-byte Spill +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %ebp -; X32-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ebx ; X32-NEXT: movl %esi, %eax ; X32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -972,60 +980,60 @@ ; X32-NEXT: addl %ebp, %eax ; X32-NEXT: movl %eax, %ebp ; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X32-NEXT: setb (%esp) # 1-byte Folded Spill ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: addl %esi, %eax ; X32-NEXT: movl %eax, %esi -; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X32-NEXT: movzbl (%esp), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %edx -; X32-NEXT: addl %edi, (%esp) # 4-byte Folded Spill +; X32-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebp, (%esp) # 4-byte Spill ; X32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X32-NEXT: adcl %eax, %esi ; X32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl $0, %edx ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl 48(%ecx), %ebp +; X32-NEXT: movl 48(%ecx), %edi ; X32-NEXT: movl %ebx, %esi -; X32-NEXT: imull %ebp, %esi -; X32-NEXT: movl %ebp, %eax +; X32-NEXT: imull %edi, %esi +; X32-NEXT: movl %edi, %eax ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X32-NEXT: mull %ebx ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: addl %esi, %edx ; X32-NEXT: movl 52(%ecx), %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: imull %eax, %ebx -; X32-NEXT: addl %esi, %ebx ; X32-NEXT: addl %edx, %ebx ; X32-NEXT: movl 56(%ecx), %eax -; X32-NEXT: movl %ecx, %esi -; X32-NEXT: movl %eax, %edx +; X32-NEXT: movl %eax, %esi +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: imull %ecx, %edx +; X32-NEXT: mull %ecx +; X32-NEXT: addl %esi, %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl 60(%esi), %esi -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: imull %edi, %esi -; X32-NEXT: addl %edx, %esi -; X32-NEXT: mull %edi +; X32-NEXT: imull %ecx, %esi ; X32-NEXT: addl %edx, %esi ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %ebx, %esi -; X32-NEXT: movl %edi, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ecx, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %ecx, %eax -; X32-NEXT: mull %ebp +; X32-NEXT: movl %ebp, %eax +; X32-NEXT: mull %edi ; X32-NEXT: movl %edx, %ecx ; X32-NEXT: movl %eax, %ebx ; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X32-NEXT: adcl $0, %ecx -; X32-NEXT: movl %edi, %eax +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %edi @@ -1042,43 +1050,43 @@ ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: adcl %esi, %edx ; 
X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: imull %eax, %edi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %eax, %ebx -; X32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X32-NEXT: addl %edi, %ecx -; X32-NEXT: addl %edx, %ecx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X32-NEXT: movl %eax, %edx -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X32-NEXT: imull %esi, %edx +; X32-NEXT: imull %eax, %ecx +; X32-NEXT: movl %eax, %esi ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X32-NEXT: imull %edi, %ebp -; X32-NEXT: addl %edx, %ebp ; X32-NEXT: mull %edi -; X32-NEXT: addl %edx, %ebp -; X32-NEXT: addl %ebx, %eax ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl %ecx, %ebp -; X32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %edi, %eax -; X32-NEXT: movl %edi, %ebx +; X32-NEXT: addl %ecx, %edx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: imull %ebp, %edi +; X32-NEXT: addl %edx, %edi +; X32-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X32-NEXT: movl %eax, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X32-NEXT: imull %edi, %ecx +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X32-NEXT: mull %ebx +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X32-NEXT: mull %ecx -; X32-NEXT: movl %edx, %edi +; X32-NEXT: imull %ebx, %ecx +; X32-NEXT: addl %edx, %ecx +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: movl %esi, %eax -; X32-NEXT: mull %ecx +; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %ebx, %eax +; X32-NEXT: mull %esi +; X32-NEXT: movl %edx, %ebx +; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X32-NEXT: movl %edi, %eax +; X32-NEXT: mull %esi ; X32-NEXT: movl %edx, %esi ; X32-NEXT: movl %eax, %ecx -; X32-NEXT: addl %edi, %ecx +; X32-NEXT: addl %ebx, %ecx ; X32-NEXT: adcl $0, %esi -; X32-NEXT: movl %ebx, %eax -; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: mull %ebp ; X32-NEXT: movl %edx, %ebx ; X32-NEXT: movl %eax, %edi @@ -1097,9 +1105,9 @@ ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X32-NEXT: addl (%esp), %ecx # 4-byte Folded Reload +; X32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X32-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X32-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload @@ 
-1171,270 +1179,272 @@ ; X64-NEXT: pushq %r13 ; X64-NEXT: pushq %r12 ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: pushq %rax +; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq (%rdi), %r14 +; X64-NEXT: movq (%rdi), %rbx ; X64-NEXT: movq 8(%rdi), %r9 -; X64-NEXT: movq 24(%rdi), %r15 -; X64-NEXT: movq 16(%rdi), %rax -; X64-NEXT: movq (%rsi), %rdi -; X64-NEXT: movq 8(%rsi), %rbx -; X64-NEXT: movq %rsi, %r12 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi +; X64-NEXT: movq 24(%rdi), %r12 +; X64-NEXT: movq 16(%rdi), %r14 +; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq (%rsi), %rcx +; X64-NEXT: movq 8(%rsi), %r11 +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %r12, %rax +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rsi, %r15 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rcx, %r10 ; X64-NEXT: adcq $0, %r8 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r10, %r11 -; X64-NEXT: adcq %r8, %rcx +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r10, %rcx +; X64-NEXT: adcq %r8, %r14 ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %esi -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: addq %r14, %r10 ; X64-NEXT: adcq %rsi, %r13 -; X64-NEXT: movq %r14, %rsi -; X64-NEXT: movq %r14, %rax -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: movq %rsi, %r8 -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r14, %r15 +; X64-NEXT: adcq $0, %rbp +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %r12 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r11 +; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: addq %r15, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %rbp +; X64-NEXT: adcq %rbp, %rbx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r9, %rdi ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rbp, %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rbx, %rbp ; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: adcq %rax, %r14 -; X64-NEXT: addq 
{{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; X64-NEXT: adcq %rcx, %r14 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: adcq $0, %r13 -; X64-NEXT: movq %r12, %r9 -; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq 16(%r12), %rsi -; X64-NEXT: movq %r8, %rbx -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r8, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdi, %rsi ; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq 16(%rdi), %r8 +; X64-NEXT: movq %r12, %r11 +; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r12, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %r9, %r12 +; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rcx, %r15 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq 24(%rsi), %rsi +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %r11, %rbp -; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq 24(%r9), %rdi -; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: adcq %r15, %r9 -; X64-NEXT: setb %bpl +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r15, %r11 +; X64-NEXT: adcq %rbx, %r9 +; X64-NEXT: setb %bl ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rdi +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r9, %r11 -; X64-NEXT: movzbl %bpl, %eax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: addq %r9, %rcx +; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %r15 -; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r14, %rbx -; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq $0, %r11 +; X64-NEXT: addq %rbp, %rdi +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r14, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq $0, %rcx ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: addq %r10, %r11 +; X64-NEXT: addq %r10, %rcx ; X64-NEXT: adcq %r13, %r15 -; X64-NEXT: setb %bpl +; X64-NEXT: setb %r12b +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r11 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; X64-NEXT: movq %r13, %rax +; X64-NEXT: addq %rdi, %rbx +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rcx, %r9 -; X64-NEXT: adcq $0, %r8 +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: adcq %r9, %rbp +; X64-NEXT: setb %dil ; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: addq 
%r9, %rax -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: adcq %r8, %rcx -; X64-NEXT: setb %r8b -; X64-NEXT: movq %r13, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rcx, %r14 -; X64-NEXT: movzbl %r8b, %eax -; X64-NEXT: adcq %rax, %rdx -; X64-NEXT: addq %r11, %rbx +; X64-NEXT: mulq %rsi +; X64-NEXT: addq %rbp, %rax +; X64-NEXT: movzbl %dil, %edi +; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: adcq %r15, %rbx ; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: adcq %r15, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movzbl %bpl, %eax -; X64-NEXT: adcq %rax, %r14 +; X64-NEXT: movzbl %r12b, %ecx +; X64-NEXT: adcq %rcx, %rax +; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: adcq $0, %rdx ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 32(%rcx), %r11 -; X64-NEXT: imulq %r11, %rdi -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: movq 32(%rcx), %r15 +; X64-NEXT: imulq %r15, %rsi +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq 40(%rcx), %r8 -; X64-NEXT: imulq %r8, %rsi -; X64-NEXT: addq %rdi, %rsi -; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: movq 40(%rcx), %rsi +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: addq %rdx, %r8 ; X64-NEXT: movq 48(%rcx), %rax -; X64-NEXT: movq %rcx, %rdx -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; X64-NEXT: imulq %r10, %rcx -; X64-NEXT: movq 56(%rdx), %rbp -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: imulq %rdi, %rbp -; X64-NEXT: addq %rcx, %rbp -; X64-NEXT: mulq %rdi +; X64-NEXT: movq %rcx, %r11 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r14, %rdi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdx, %rbp +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: movq 56(%r11), %r11 +; X64-NEXT: imulq %rbx, %r11 +; X64-NEXT: addq %rdx, %r11 ; X64-NEXT: addq %r9, %rcx -; X64-NEXT: adcq %rsi, %rbp -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %rsi -; X64-NEXT: mulq %r11 +; X64-NEXT: adcq %r8, %r11 +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: movq %rbx, %r8 +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %r15 ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rdi, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rdi, %rbx ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r15 -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %r11, %rbx +; X64-NEXT: movq %rax, %r13 +; X64-NEXT: addq %rbx, %r13 ; X64-NEXT: adcq %r9, %r15 -; X64-NEXT: setb %sil -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %r8 +; X64-NEXT: setb %dil +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %r15, %r8 -; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: adcq %rax, %r12 ; X64-NEXT: addq %rcx, %r8 -; X64-NEXT: adcq %rbp, %r12 -; 
X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: movq 56(%rcx), %rsi -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: imulq %rax, %rsi -; X64-NEXT: movq 48(%rcx), %r11 -; X64-NEXT: movq %rcx, %rdi -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; X64-NEXT: imulq %rcx, %r11 -; X64-NEXT: addq %rsi, %r11 -; X64-NEXT: addq %rdx, %r11 -; X64-NEXT: movq 32(%rdi), %r9 -; X64-NEXT: movq 40(%rdi), %rdi +; X64-NEXT: adcq %r11, %r12 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; X64-NEXT: movq 48(%r9), %rsi +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; X64-NEXT: imulq %r14, %rsi +; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: movq %r9, %rdx +; X64-NEXT: movq 56(%r9), %rax +; X64-NEXT: imulq %rdi, %rax +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: addq %rax, %rsi +; X64-NEXT: movq 32(%r9), %r9 +; X64-NEXT: movq 40(%rdx), %r15 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: imulq %rdi, %rdx -; X64-NEXT: imulq %r9, %r13 -; X64-NEXT: addq %rdx, %r13 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: imulq %r15, %r11 ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rdx, %r13 -; X64-NEXT: addq %r10, %r15 -; X64-NEXT: adcq %r11, %r13 +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: addq %r11, %rdx +; X64-NEXT: imulq %r9, %r10 +; X64-NEXT: addq %rdx, %r10 +; X64-NEXT: addq %rcx, %rdi +; X64-NEXT: adcq %rsi, %r10 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rbp -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %rbp +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %r10, %r11 +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rsi, %rbx ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r10 -; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %r11, %rcx -; X64-NEXT: adcq %rbp, %r9 -; X64-NEXT: setb %r11b -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r10 -; X64-NEXT: addq %r9, %rax -; X64-NEXT: movzbl %r11b, %edi -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq %r15, %rax -; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq %rbx, %rcx +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rbx, %r9 +; X64-NEXT: adcq %rbp, %rsi +; X64-NEXT: setb %bl +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r14 +; X64-NEXT: addq %rsi, %rax +; X64-NEXT: movzbl %bl, %esi +; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: addq %rdi, %rax +; X64-NEXT: adcq %r10, %rdx +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq %r13, %r9 ; X64-NEXT: adcq %r8, %rax ; X64-NEXT: adcq %r12, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; X64-NEXT: adcq %r14, %rax +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; X64-NEXT: adcq 
{{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; X64-NEXT: movq (%rsp), %rsi # 8-byte Reload +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, (%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 8(%rsi) +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; X64-NEXT: movq %rdi, 16(%rsi) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, (%rdi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, 8(%rdi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, 16(%rdi) -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; X64-NEXT: movq %r8, 24(%rdi) -; X64-NEXT: movq %rsi, 32(%rdi) -; X64-NEXT: movq %rcx, 40(%rdi) -; X64-NEXT: movq %rax, 48(%rdi) -; X64-NEXT: movq %rdx, 56(%rdi) +; X64-NEXT: movq %rdi, 24(%rsi) +; X64-NEXT: movq %rcx, 32(%rsi) +; X64-NEXT: movq %r9, 40(%rsi) +; X64-NEXT: movq %rax, 48(%rsi) +; X64-NEXT: movq %rdx, 56(%rsi) +; X64-NEXT: addq $8, %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %r12 ; X64-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -9,8 +9,8 @@ ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: imulq %rdi, %rcx ; X64-NEXT: mulq %rdx +; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: imulq %rsi, %r8 -; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: addq %r8, %rdx ; X64-NEXT: retq ; @@ -24,68 +24,71 @@ ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $8, %esp -; X86-NEXT: .cfi_def_cfa_offset 28 +; X86-NEXT: subl $12, %esp +; X86-NEXT: .cfi_def_cfa_offset 32 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: imull %ebp, %eax +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi -; X86-NEXT: addl %eax, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edi, %edx -; X86-NEXT: imull {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %esi +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: imull %esi, %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: mull %edi -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl 
%ebp, %edi ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebp, %edi -; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: adcl %edi, %ebp ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl %bl, %esi -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movzbl %bl, %edi +; X86-NEXT: adcl %edi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %edi, 4(%ecx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: movl %eax, 8(%ecx) -; X86-NEXT: movl %edx, 12(%ecx) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, (%esi) +; X86-NEXT: movl %eax, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/mul64.ll b/llvm/test/CodeGen/X86/mul64.ll --- a/llvm/test/CodeGen/X86/mul64.ll +++ b/llvm/test/CodeGen/X86/mul64.ll @@ -11,8 +11,8 @@ ; X32-NEXT: movl %ecx, %eax ; X32-NEXT: mull %esi ; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X32-NEXT: addl %ecx, %edx ; X32-NEXT: imull {{[0-9]+}}(%esp), %esi -; X32-NEXT: addl %ecx, %esi ; X32-NEXT: addl %esi, %edx ; X32-NEXT: popl %esi ; X32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -13,52 +13,54 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 24 ; CHECK-NEXT: .cfi_offset %rbx, -24 ; CHECK-NEXT: .cfi_offset %r14, -16 -; CHECK-NEXT: movq %rdx, %r10 -; CHECK-NEXT: movq %rdi, %r9 +; CHECK-NEXT: movq %rdx, %r11 +; CHECK-NEXT: movq %rdi, %r10 ; CHECK-NEXT: movq %rsi, %rdi ; CHECK-NEXT: sarq $63, %rdi -; CHECK-NEXT: movq %rcx, %r11 -; CHECK-NEXT: imulq %rdi, %r11 +; CHECK-NEXT: movq %rcx, %r8 +; CHECK-NEXT: imulq %rdi, %r8 ; CHECK-NEXT: movq %rdx, %rax ; CHECK-NEXT: mulq %rdi -; CHECK-NEXT: movq %rax, %rdi -; CHECK-NEXT: addq %rax, %r11 -; CHECK-NEXT: addq %rdx, %r11 +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: movq %rax, %rbx +; CHECK-NEXT: addq %rax, %rdi +; CHECK-NEXT: addq %r8, %rdi ; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: sarq $63, %rax ; CHECK-NEXT: movq %rax, %r14 ; CHECK-NEXT: imulq %rsi, %r14 -; CHECK-NEXT: mulq %r9 -; CHECK-NEXT: movq %rax, %r8 -; CHECK-NEXT: addq %rax, %r14 -; CHECK-NEXT: addq %rdx, %r14 -; CHECK-NEXT: addq %rdi, %r8 -; CHECK-NEXT: adcq %r11, %r14 -; CHECK-NEXT: 
movq %r9, %rax ; CHECK-NEXT: mulq %r10 -; CHECK-NEXT: movq %rdx, %r11 +; CHECK-NEXT: movq %rax, %r9 +; CHECK-NEXT: movq %rdx, %r8 +; CHECK-NEXT: addq %r14, %r8 +; CHECK-NEXT: addq %rax, %r8 +; CHECK-NEXT: addq %rbx, %r9 +; CHECK-NEXT: adcq %rdi, %r8 +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: mulq %r11 +; CHECK-NEXT: movq %rdx, %rbx ; CHECK-NEXT: movq %rax, %rdi ; CHECK-NEXT: movq %rsi, %rax -; CHECK-NEXT: mulq %r10 -; CHECK-NEXT: movq %rdx, %r10 -; CHECK-NEXT: movq %rax, %rbx -; CHECK-NEXT: addq %r11, %rbx -; CHECK-NEXT: adcq $0, %r10 -; CHECK-NEXT: movq %r9, %rax -; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: mulq %r11 ; CHECK-NEXT: movq %rdx, %r11 -; CHECK-NEXT: movq %rax, %r9 -; CHECK-NEXT: addq %rbx, %r9 -; CHECK-NEXT: adcq %r10, %r11 +; CHECK-NEXT: movq %rax, %r14 +; CHECK-NEXT: addq %rbx, %r14 +; CHECK-NEXT: adcq $0, %r11 +; CHECK-NEXT: movq %r10, %rax +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %r10 +; CHECK-NEXT: addq %r14, %r10 +; CHECK-NEXT: adcq %r11, %rbx ; CHECK-NEXT: setb %al -; CHECK-NEXT: movzbl %al, %r10d +; CHECK-NEXT: movzbl %al, %r11d ; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: mulq %rcx -; CHECK-NEXT: addq %r11, %rax -; CHECK-NEXT: adcq %r10, %rdx -; CHECK-NEXT: addq %r8, %rax -; CHECK-NEXT: adcq %r14, %rdx -; CHECK-NEXT: movq %r9, %rcx +; CHECK-NEXT: addq %rbx, %rax +; CHECK-NEXT: adcq %r11, %rdx +; CHECK-NEXT: addq %r9, %rax +; CHECK-NEXT: adcq %r8, %rdx +; CHECK-NEXT: movq %r10, %rcx ; CHECK-NEXT: sarq $63, %rcx ; CHECK-NEXT: xorq %rcx, %rdx ; CHECK-NEXT: xorq %rax, %rcx @@ -66,7 +68,7 @@ ; CHECK-NEXT: jne LBB0_1 ; CHECK-NEXT: ## %bb.2: ## %nooverflow ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: movq %r9, %rdx +; CHECK-NEXT: movq %r10, %rdx ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll --- a/llvm/test/CodeGen/X86/optimize-max-0.ll +++ b/llvm/test/CodeGen/X86/optimize-max-0.ll @@ -171,9 +171,10 @@ ; CHECK-NEXT: jne LBB0_17 ; CHECK-NEXT: LBB0_18: ## %bb26 ; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %esi ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx -; CHECK-NEXT: addl %ecx, %edx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; CHECK-NEXT: addl %esi, %edx ; CHECK-NEXT: jmp LBB0_23 ; CHECK-NEXT: LBB0_19: ## %bb29 ; CHECK-NEXT: testl %edx, %edx @@ -605,9 +606,15 @@ ; CHECK-NEXT: jne LBB1_17 ; CHECK-NEXT: LBB1_18: ## %bb26 ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: shrl %ecx +; CHECK-NEXT: subl $4, %esp +; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: pushl $128 +; CHECK-NEXT: pushl %edx ; CHECK-NEXT: jmp LBB1_23 ; CHECK-NEXT: LBB1_19: ## %bb29 ; CHECK-NEXT: testl %ebp, %ebp @@ -638,12 +645,12 @@ ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: LBB1_23: ## %bb33 ; CHECK-NEXT: shrl %eax ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: pushl %eax ; CHECK-NEXT: pushl $128 ; CHECK-NEXT: pushl %ecx +; CHECK-NEXT: LBB1_23: ## %bb33 ; CHECK-NEXT: calll _memset ; 
CHECK-NEXT: addl $44, %esp ; CHECK-NEXT: LBB1_25: ## %return diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -352,23 +352,23 @@ ; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: addl %esi, %edx -; X86-NOSSE-NEXT: movl %ecx, %esi -; X86-NOSSE-NEXT: shrl %esi -; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %esi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %esi -; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %ecx ; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %esi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %esi -; X86-NOSSE-NEXT: shrl $4, %esi -; X86-NOSSE-NEXT: addl %ecx, %esi -; X86-NOSSE-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %esi, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx ; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx ; X86-NOSSE-NEXT: movl %ecx, (%eax) ; X86-NOSSE-NEXT: movl $0, 12(%eax) ; X86-NOSSE-NEXT: movl $0, 8(%eax) @@ -420,18 +420,20 @@ ; ; X86-POPCNT-LABEL: cnt128: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %esi ; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: addl %edx, %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx -; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: movl %edx, (%eax) +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: addl %ecx, %esi +; X86-POPCNT-NEXT: addl %edx, %esi +; X86-POPCNT-NEXT: movl %esi, (%eax) ; X86-POPCNT-NEXT: movl $0, 12(%eax) ; X86-POPCNT-NEXT: movl $0, 8(%eax) ; X86-POPCNT-NEXT: movl $0, 4(%eax) +; X86-POPCNT-NEXT: popl %esi ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128: @@ -798,80 +800,82 @@ ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: movl %ebx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %ecx # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: subl %eax, %ebx -; X86-NOSSE-NEXT: movl $858993459, %eax # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ebx, %edi -; X86-NOSSE-NEXT: andl %eax, %edi +; X86-NOSSE-NEXT: movl %ebx, %ecx +; X86-NOSSE-NEXT: shrl %ecx +; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %edi, %ecx +; X86-NOSSE-NEXT: subl %ecx, %ebx +; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %ebx, %ebp +; X86-NOSSE-NEXT: andl %ecx, %ebp ; X86-NOSSE-NEXT: shrl $2, %ebx -; X86-NOSSE-NEXT: 
andl %eax, %ebx -; X86-NOSSE-NEXT: addl %edi, %ebx -; X86-NOSSE-NEXT: movl %ebx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %ebx, %edi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: subl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: andl %eax, %ebx -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %eax, %esi -; X86-NOSSE-NEXT: addl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: movl $252645135, %esi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %esi, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %ebp # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %ebp -; X86-NOSSE-NEXT: andl %esi, %ebx -; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %ebp, %edi -; X86-NOSSE-NEXT: movl %edx, %ebx +; X86-NOSSE-NEXT: addl %ebp, %ebx +; X86-NOSSE-NEXT: movl %ebx, %ebp +; X86-NOSSE-NEXT: shrl $4, %ebp +; X86-NOSSE-NEXT: addl %ebx, %ebp +; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: subl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: subl %ebx, %edx -; X86-NOSSE-NEXT: movl %edx, %ebx -; X86-NOSSE-NEXT: andl %eax, %ebx -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl %eax, %edx -; X86-NOSSE-NEXT: addl %ebx, %edx -; X86-NOSSE-NEXT: movl %edx, %ebp +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: addl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %ebx, %ebp +; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl %ebx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %ebp, %eax +; X86-NOSSE-NEXT: subl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %ebp ; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %edx, %ebp -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: andl %esi, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: addl %esi, %ebp +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %eax +; X86-NOSSE-NEXT: subl %eax, %edx +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: shrl $2, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: shrl $4, %eax +; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: andl %ebx, %ebp +; X86-NOSSE-NEXT: andl %ebx, %eax +; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: 
movl %ebx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl %ecx, %edi -; X86-NOSSE-NEXT: subl %edi, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: andl %eax, %ecx -; X86-NOSSE-NEXT: shrl $2, %ebx -; X86-NOSSE-NEXT: andl %eax, %ebx -; X86-NOSSE-NEXT: addl %ecx, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: shrl $4, %ecx -; X86-NOSSE-NEXT: addl %ebx, %ecx -; X86-NOSSE-NEXT: andl %esi, %ecx +; X86-NOSSE-NEXT: addl %ecx, %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: addl %edx, %ecx -; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: movl %edx, 12(%eax) -; X86-NOSSE-NEXT: movl %edx, 8(%eax) -; X86-NOSSE-NEXT: movl %edx, 4(%eax) -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: addl %edi, %edx +; X86-NOSSE-NEXT: xorl %ecx, %ecx +; X86-NOSSE-NEXT: movl %ecx, 12(%eax) +; X86-NOSSE-NEXT: movl %ecx, 8(%eax) +; X86-NOSSE-NEXT: movl %ecx, 4(%eax) +; X86-NOSSE-NEXT: movl %edx, (%eax) ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx @@ -920,19 +924,21 @@ ; ; X86-POPCNT-LABEL: cnt128_optsize: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %esi ; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: addl %edx, %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx -; X86-POPCNT-NEXT: addl %ecx, %edx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: addl %ecx, %esi +; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: xorl %ecx, %ecx ; X86-POPCNT-NEXT: movl %ecx, 12(%eax) ; X86-POPCNT-NEXT: movl %ecx, 8(%eax) ; X86-POPCNT-NEXT: movl %ecx, 4(%eax) -; X86-POPCNT-NEXT: movl %edx, (%eax) +; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: popl %esi ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_optsize: @@ -1223,80 +1229,82 @@ ; X86-NOSSE-NEXT: pushl %esi ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: movl %ebx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %ecx # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: subl %eax, %ebx -; X86-NOSSE-NEXT: movl $858993459, %eax # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ebx, %edi -; X86-NOSSE-NEXT: andl %eax, %edi +; X86-NOSSE-NEXT: movl %ebx, %ecx +; X86-NOSSE-NEXT: shrl %ecx +; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %edi, %ecx +; X86-NOSSE-NEXT: subl %ecx, %ebx +; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 +; X86-NOSSE-NEXT: movl %ebx, %ebp +; X86-NOSSE-NEXT: andl %ecx, %ebp ; X86-NOSSE-NEXT: shrl $2, %ebx -; X86-NOSSE-NEXT: andl %eax, %ebx -; X86-NOSSE-NEXT: addl %edi, %ebx -; X86-NOSSE-NEXT: movl %ebx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %ebx, %edi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: subl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: andl %eax, %ebx -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %eax, %esi -; X86-NOSSE-NEXT: addl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; 
X86-NOSSE-NEXT: movl $252645135, %esi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %esi, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %ebp # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %ebp -; X86-NOSSE-NEXT: andl %esi, %ebx -; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %ebp, %edi -; X86-NOSSE-NEXT: movl %edx, %ebx +; X86-NOSSE-NEXT: addl %ebp, %ebx +; X86-NOSSE-NEXT: movl %ebx, %ebp +; X86-NOSSE-NEXT: shrl $4, %ebp +; X86-NOSSE-NEXT: addl %ebx, %ebp +; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: shrl %ebx +; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: subl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: subl %ebx, %edx -; X86-NOSSE-NEXT: movl %edx, %ebx -; X86-NOSSE-NEXT: andl %eax, %ebx -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl %eax, %edx -; X86-NOSSE-NEXT: addl %ebx, %edx -; X86-NOSSE-NEXT: movl %edx, %ebp +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: addl %ebx, %eax +; X86-NOSSE-NEXT: movl %eax, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %ebx, %ebp +; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl %ebx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: addl %eax, %edi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %ebp, %eax +; X86-NOSSE-NEXT: subl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: movl %esi, %ebp ; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %edx, %ebp -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: andl %esi, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: addl %esi, %ebp +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %eax +; X86-NOSSE-NEXT: subl %eax, %edx +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: shrl $2, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: movl %edx, %eax +; X86-NOSSE-NEXT: shrl $4, %eax +; X86-NOSSE-NEXT: addl %edx, %eax +; X86-NOSSE-NEXT: andl %ebx, %ebp +; X86-NOSSE-NEXT: andl %ebx, %eax +; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %ecx +; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: movl %ebx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl %ecx, %edi -; X86-NOSSE-NEXT: subl %edi, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: andl %eax, %ecx -; X86-NOSSE-NEXT: shrl $2, %ebx -; X86-NOSSE-NEXT: andl %eax, %ebx -; X86-NOSSE-NEXT: addl %ecx, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: shrl $4, %ecx -; X86-NOSSE-NEXT: addl %ebx, %ecx -; X86-NOSSE-NEXT: andl %esi, %ecx +; X86-NOSSE-NEXT: addl %ecx, %edx ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 
0x1010101 -; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: addl %edx, %ecx -; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: movl %edx, 12(%eax) -; X86-NOSSE-NEXT: movl %edx, 8(%eax) -; X86-NOSSE-NEXT: movl %edx, 4(%eax) -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: addl %edi, %edx +; X86-NOSSE-NEXT: xorl %ecx, %ecx +; X86-NOSSE-NEXT: movl %ecx, 12(%eax) +; X86-NOSSE-NEXT: movl %ecx, 8(%eax) +; X86-NOSSE-NEXT: movl %ecx, 4(%eax) +; X86-NOSSE-NEXT: movl %edx, (%eax) ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx @@ -1345,19 +1353,21 @@ ; ; X86-POPCNT-LABEL: cnt128_pgso: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %esi ; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: addl %edx, %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx -; X86-POPCNT-NEXT: addl %ecx, %edx +; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: addl %ecx, %esi +; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: xorl %ecx, %ecx ; X86-POPCNT-NEXT: movl %ecx, 12(%eax) ; X86-POPCNT-NEXT: movl %ecx, 8(%eax) ; X86-POPCNT-NEXT: movl %ecx, 4(%eax) -; X86-POPCNT-NEXT: movl %edx, (%eax) +; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: popl %esi ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_pgso: diff --git a/llvm/test/CodeGen/X86/pr34080-2.ll b/llvm/test/CodeGen/X86/pr34080-2.ll --- a/llvm/test/CodeGen/X86/pr34080-2.ll +++ b/llvm/test/CodeGen/X86/pr34080-2.ll @@ -31,6 +31,10 @@ ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: imull %edx ; CHECK-NEXT: movl %edx, %edi +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: shrl $31, %eax +; CHECK-NEXT: sarl $7, %edi +; CHECK-NEXT: addl %eax, %edi ; CHECK-NEXT: imull $36525, %esi, %eax # imm = 0x8EAD ; CHECK-NEXT: addl $172251900, %eax # imm = 0xA445AFC ; CHECK-NEXT: movl $1374389535, %edx # imm = 0x51EB851F @@ -39,11 +43,7 @@ ; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: sarl $5, %edx ; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shrl $31, %eax ; CHECK-NEXT: addl 16(%ebx), %ecx -; CHECK-NEXT: addl %eax, %ecx -; CHECK-NEXT: sarl $7, %edi ; CHECK-NEXT: addl %edi, %ecx ; CHECK-NEXT: leal 257(%ecx,%edx), %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll --- a/llvm/test/CodeGen/X86/ptest.ll +++ b/llvm/test/CodeGen/X86/ptest.ll @@ -104,11 +104,11 @@ ; SSE2-LABEL: veccond512: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: je .LBB2_2 ; SSE2-NEXT: # %bb.1: # %if-true-block @@ -121,9 +121,9 @@ ; SSE41-LABEL: veccond512: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: je .LBB2_2 ; SSE41-NEXT: # %bb.1: # %if-true-block ; SSE41-NEXT: xorl %eax, %eax @@ -237,11 +237,11 @@ ; SSE2-LABEL: vectest512: ; SSE2: 
# %bb.0: ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %ecx +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %ecx ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: setne %al @@ -250,10 +250,10 @@ ; SSE41-LABEL: vectest512: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 ; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -349,11 +349,11 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: movl %edi, %eax ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %ecx +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %ecx ; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF ; SSE2-NEXT: cmovel %esi, %eax ; SSE2-NEXT: retq @@ -362,9 +362,9 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movl %edi, %eax ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: cmovel %esi, %eax ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/rev16.ll b/llvm/test/CodeGen/X86/rev16.ll --- a/llvm/test/CodeGen/X86/rev16.ll +++ b/llvm/test/CodeGen/X86/rev16.ll @@ -58,28 +58,29 @@ define i32 @extra_maskop_uses2(i32 %a) { ; X86-LABEL: extra_maskop_uses2: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shll $8, %edx -; X86-NEXT: shrl $8, %ecx -; X86-NEXT: andl $-16711936, %edx # imm = 0xFF00FF00 -; X86-NEXT: andl $16711935, %ecx # imm = 0xFF00FF -; X86-NEXT: leal (%ecx,%edx), %eax -; X86-NEXT: imull %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: shll $8, %ecx +; X86-NEXT: shrl $8, %eax +; X86-NEXT: andl $-16711936, %ecx # imm = 0xFF00FF00 +; X86-NEXT: andl $16711935, %eax # imm = 0xFF00FF +; X86-NEXT: leal (%eax,%ecx), %edx ; X86-NEXT: imull %ecx, %eax +; X86-NEXT: imull %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: extra_maskop_uses2: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movl %edi, %ecx -; X64-NEXT: shll $8, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $8, %eax ; X64-NEXT: shrl $8, %edi -; X64-NEXT: andl $-16711936, %ecx # imm = 0xFF00FF00 +; X64-NEXT: andl $-16711936, %eax # imm = 0xFF00FF00 ; X64-NEXT: andl $16711935, %edi # imm = 0xFF00FF -; X64-NEXT: leal (%rdi,%rcx), %eax -; X64-NEXT: imull %ecx, %eax +; X64-NEXT: leal (%rdi,%rax), %ecx ; X64-NEXT: imull %edi, %eax +; X64-NEXT: imull %ecx, %eax +; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: retq %l8 = shl i32 %a, 8 %r8 = lshr i32 %a, 8 diff --git a/llvm/test/CodeGen/X86/rotate-multi.ll b/llvm/test/CodeGen/X86/rotate-multi.ll --- a/llvm/test/CodeGen/X86/rotate-multi.ll +++ b/llvm/test/CodeGen/X86/rotate-multi.ll @@ -27,8 +27,8 @@ ; CHECK: # %bb.0: # %b0 ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: shll $7, %eax +; 
CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: roll $9, %edi -; CHECK-NEXT: orl %esi, %edi ; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: retq b0: @@ -52,10 +52,10 @@ ; CHECK-NEXT: shrl $21, %edi ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: shll $19, %eax +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: shrl $13, %esi ; CHECK-NEXT: orl %edi, %esi ; CHECK-NEXT: orl %esi, %eax -; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retq %v0 = shl i32 %a0, 11 %v1 = lshr i32 %a0, 21 @@ -73,33 +73,33 @@ ; CHECK-LABEL: f3: ; CHECK: # %bb.0: # %b0 ; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: leal (,%rdi,8), %ecx -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll $5, %eax -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: shll $7, %edx -; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: leal (,%rdi,8), %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: shll $5, %ecx +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shll $13, %eax -; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: shll $7, %eax ; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: shll $19, %edx +; CHECK-NEXT: shll $13, %edx ; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: orl %ecx, %edx ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: shrl $2, %eax -; CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: movl %edi, %edx -; CHECK-NEXT: shrl $15, %edx -; CHECK-NEXT: orl %eax, %edx +; CHECK-NEXT: shll $19, %eax +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: shrl $2, %ecx +; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movl %edi, %esi -; CHECK-NEXT: shrl $23, %esi +; CHECK-NEXT: shrl $15, %esi +; CHECK-NEXT: orl %ecx, %esi ; CHECK-NEXT: orl %edx, %esi +; CHECK-NEXT: movl %edi, %ecx +; CHECK-NEXT: shrl $23, %ecx ; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: shrl $25, %eax -; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: shrl $30, %edi ; CHECK-NEXT: orl %edi, %eax -; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: orl %esi, %eax ; CHECK-NEXT: retq b0: %v0 = shl i32 %a0, 3 diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -52,8 +52,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -152,31 +152,31 @@ ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 ; SSE2-NEXT: psadbw b+1024(%rax), %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: movdqa a+1040(%rax), %xmm3 ; SSE2-NEXT: psadbw b+1040(%rax), %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: addq $32, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: paddd %xmm1, 
%xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: sad_32i8: @@ -205,8 +205,8 @@ ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -339,15 +339,15 @@ ; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm5, %xmm3 ; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] +; SSE2-NEXT: paddd %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -355,10 +355,10 @@ ; ; AVX1-LABEL: sad_avx64i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 @@ -370,32 +370,32 @@ ; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm5, %xmm5 ; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm6 ; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 ; AVX1-NEXT: vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-NEXT: addq $64, %rax ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %middle.block -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm7 -; AVX1-NEXT: vpaddd %xmm1, %xmm1, %xmm8 -; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm8 -; AVX1-NEXT: vpaddd %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm7 +; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm8 +; AVX1-NEXT: vpaddd %xmm0, %xmm8, %xmm8 +; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpaddd %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 
+; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm6, %xmm2 +; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -426,8 +426,8 @@ ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -878,11 +878,11 @@ ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: movdqu 32(%rdi), %xmm1 ; SSE2-NEXT: psadbw %xmm2, %xmm1 +; SSE2-NEXT: paddq %xmm4, %xmm1 ; SSE2-NEXT: movdqu 48(%rdi), %xmm2 ; SSE2-NEXT: psadbw %xmm3, %xmm2 ; SSE2-NEXT: paddq %xmm0, %xmm2 ; SSE2-NEXT: paddq %xmm1, %xmm2 -; SSE2-NEXT: paddq %xmm4, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax @@ -898,8 +898,8 @@ ; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll --- a/llvm/test/CodeGen/X86/setcc-wide-types.ll +++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll @@ -253,6 +253,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx +; SSE2-NEXT: orq %r11, %rdx ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rsi ; SSE2-NEXT: xorq %rdi, %rsi @@ -261,18 +262,17 @@ ; SSE2-NEXT: xorq %r8, %rdi ; SSE2-NEXT: orq %rsi, %rdi ; SSE2-NEXT: orq %rdx, %rdi -; SSE2-NEXT: orq %r11, %rdi ; SSE2-NEXT: movq %xmm4, %rdx ; SSE2-NEXT: xorq %r9, %rdx ; SSE2-NEXT: movq %xmm6, %rsi ; SSE2-NEXT: xorq %r10, %rsi -; SSE2-NEXT: movq %xmm5, %r8 -; SSE2-NEXT: xorq %rcx, %r8 +; SSE2-NEXT: orq %rdx, %rsi +; SSE2-NEXT: movq %xmm5, %rdx +; SSE2-NEXT: xorq %rcx, %rdx ; SSE2-NEXT: movq %xmm7, %rcx ; SSE2-NEXT: xorq %rax, %rcx -; SSE2-NEXT: orq %r8, %rcx -; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: orq %rdi, %rcx ; SSE2-NEXT: setne %al @@ -280,38 +280,38 @@ ; ; SSE41-LABEL: ne_i512: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: movq %xmm2, %rcx -; SSE41-NEXT: movq %xmm1, %rdx -; SSE41-NEXT: movq %xmm3, %rsi -; SSE41-NEXT: pextrq $1, %xmm0, %rdi -; SSE41-NEXT: pextrq $1, %xmm2, %r8 -; SSE41-NEXT: pextrq $1, %xmm1, %r9 -; SSE41-NEXT: pextrq $1, %xmm3, %r10 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: movq %xmm2, %rdx +; SSE41-NEXT: movq %xmm1, %rsi +; SSE41-NEXT: movq %xmm3, %rdi +; SSE41-NEXT: pextrq $1, %xmm0, %r8 +; SSE41-NEXT: pextrq $1, %xmm2, %r9 +; SSE41-NEXT: pextrq $1, %xmm1, %r10 +; SSE41-NEXT: pextrq $1, %xmm3, %rax ; 
SSE41-NEXT: movq %xmm4, %r11 -; SSE41-NEXT: xorq %rax, %r11 -; SSE41-NEXT: movq %xmm6, %rax -; SSE41-NEXT: xorq %rcx, %rax -; SSE41-NEXT: movq %xmm5, %rcx +; SSE41-NEXT: xorq %rcx, %r11 +; SSE41-NEXT: movq %xmm6, %rcx ; SSE41-NEXT: xorq %rdx, %rcx -; SSE41-NEXT: movq %xmm7, %rdx +; SSE41-NEXT: orq %r11, %rcx +; SSE41-NEXT: movq %xmm5, %rdx ; SSE41-NEXT: xorq %rsi, %rdx -; SSE41-NEXT: orq %rcx, %rdx -; SSE41-NEXT: orq %rax, %rdx -; SSE41-NEXT: orq %r11, %rdx -; SSE41-NEXT: pextrq $1, %xmm4, %rax -; SSE41-NEXT: xorq %rdi, %rax -; SSE41-NEXT: pextrq $1, %xmm6, %rcx +; SSE41-NEXT: movq %xmm7, %rsi +; SSE41-NEXT: xorq %rdi, %rsi +; SSE41-NEXT: orq %rdx, %rsi +; SSE41-NEXT: orq %rcx, %rsi +; SSE41-NEXT: pextrq $1, %xmm4, %rcx ; SSE41-NEXT: xorq %r8, %rcx -; SSE41-NEXT: pextrq $1, %xmm5, %rsi -; SSE41-NEXT: xorq %r9, %rsi +; SSE41-NEXT: pextrq $1, %xmm6, %rdx +; SSE41-NEXT: xorq %r9, %rdx +; SSE41-NEXT: orq %rcx, %rdx +; SSE41-NEXT: pextrq $1, %xmm5, %rcx +; SSE41-NEXT: xorq %r10, %rcx ; SSE41-NEXT: pextrq $1, %xmm7, %rdi -; SSE41-NEXT: xorq %r10, %rdi -; SSE41-NEXT: orq %rsi, %rdi +; SSE41-NEXT: xorq %rax, %rdi ; SSE41-NEXT: orq %rcx, %rdi -; SSE41-NEXT: orq %rax, %rdi -; SSE41-NEXT: xorl %eax, %eax ; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: orq %rsi, %rdi ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -331,6 +331,7 @@ ; AVX1-NEXT: xorq %rdx, %r11 ; AVX1-NEXT: vmovq %xmm3, %rdx ; AVX1-NEXT: xorq %rsi, %rdx +; AVX1-NEXT: orq %r11, %rdx ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rsi ; AVX1-NEXT: xorq %rdi, %rsi @@ -339,18 +340,17 @@ ; AVX1-NEXT: xorq %r8, %rdi ; AVX1-NEXT: orq %rsi, %rdi ; AVX1-NEXT: orq %rdx, %rdi -; AVX1-NEXT: orq %r11, %rdi ; AVX1-NEXT: vpextrq $1, %xmm2, %rdx ; AVX1-NEXT: xorq %r9, %rdx ; AVX1-NEXT: vpextrq $1, %xmm3, %rsi ; AVX1-NEXT: xorq %r10, %rsi -; AVX1-NEXT: vpextrq $1, %xmm0, %r8 -; AVX1-NEXT: xorq %rcx, %r8 +; AVX1-NEXT: orq %rdx, %rsi +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: xorq %rcx, %rdx ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx ; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: orq %r8, %rcx -; AVX1-NEXT: orq %rsi, %rcx ; AVX1-NEXT: orq %rdx, %rcx +; AVX1-NEXT: orq %rsi, %rcx ; AVX1-NEXT: xorl %eax, %eax ; AVX1-NEXT: orq %rdi, %rcx ; AVX1-NEXT: setne %al @@ -373,6 +373,7 @@ ; AVX2-NEXT: xorq %rdx, %r11 ; AVX2-NEXT: vmovq %xmm3, %rdx ; AVX2-NEXT: xorq %rsi, %rdx +; AVX2-NEXT: orq %r11, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rsi ; AVX2-NEXT: xorq %rdi, %rsi @@ -381,18 +382,17 @@ ; AVX2-NEXT: xorq %r8, %rdi ; AVX2-NEXT: orq %rsi, %rdi ; AVX2-NEXT: orq %rdx, %rdi -; AVX2-NEXT: orq %r11, %rdi ; AVX2-NEXT: vpextrq $1, %xmm2, %rdx ; AVX2-NEXT: xorq %r9, %rdx ; AVX2-NEXT: vpextrq $1, %xmm3, %rsi ; AVX2-NEXT: xorq %r10, %rsi -; AVX2-NEXT: vpextrq $1, %xmm0, %r8 -; AVX2-NEXT: xorq %rcx, %r8 +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: xorq %rcx, %rdx ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx ; AVX2-NEXT: xorq %rax, %rcx -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rsi, %rcx ; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: orq %rsi, %rcx ; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: orq %rdi, %rcx ; AVX2-NEXT: setne %al @@ -444,6 +444,7 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rdx ; SSE2-NEXT: xorq %rsi, %rdx +; SSE2-NEXT: orq %r11, %rdx ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] ; SSE2-NEXT: movq %xmm0, %rsi ; SSE2-NEXT: xorq %rdi, %rsi @@ -452,18 +453,17 @@ ; SSE2-NEXT: xorq %r8, %rdi ; SSE2-NEXT: orq %rsi, %rdi ; 
SSE2-NEXT: orq %rdx, %rdi -; SSE2-NEXT: orq %r11, %rdi ; SSE2-NEXT: movq %xmm4, %rdx ; SSE2-NEXT: xorq %r9, %rdx ; SSE2-NEXT: movq %xmm6, %rsi ; SSE2-NEXT: xorq %r10, %rsi -; SSE2-NEXT: movq %xmm5, %r8 -; SSE2-NEXT: xorq %rcx, %r8 +; SSE2-NEXT: orq %rdx, %rsi +; SSE2-NEXT: movq %xmm5, %rdx +; SSE2-NEXT: xorq %rcx, %rdx ; SSE2-NEXT: movq %xmm7, %rcx ; SSE2-NEXT: xorq %rax, %rcx -; SSE2-NEXT: orq %r8, %rcx -; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: orq %rdx, %rcx +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: orq %rdi, %rcx ; SSE2-NEXT: sete %al @@ -471,38 +471,38 @@ ; ; SSE41-LABEL: eq_i512: ; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: movq %xmm2, %rcx -; SSE41-NEXT: movq %xmm1, %rdx -; SSE41-NEXT: movq %xmm3, %rsi -; SSE41-NEXT: pextrq $1, %xmm0, %rdi -; SSE41-NEXT: pextrq $1, %xmm2, %r8 -; SSE41-NEXT: pextrq $1, %xmm1, %r9 -; SSE41-NEXT: pextrq $1, %xmm3, %r10 +; SSE41-NEXT: movq %xmm0, %rcx +; SSE41-NEXT: movq %xmm2, %rdx +; SSE41-NEXT: movq %xmm1, %rsi +; SSE41-NEXT: movq %xmm3, %rdi +; SSE41-NEXT: pextrq $1, %xmm0, %r8 +; SSE41-NEXT: pextrq $1, %xmm2, %r9 +; SSE41-NEXT: pextrq $1, %xmm1, %r10 +; SSE41-NEXT: pextrq $1, %xmm3, %rax ; SSE41-NEXT: movq %xmm4, %r11 -; SSE41-NEXT: xorq %rax, %r11 -; SSE41-NEXT: movq %xmm6, %rax -; SSE41-NEXT: xorq %rcx, %rax -; SSE41-NEXT: movq %xmm5, %rcx +; SSE41-NEXT: xorq %rcx, %r11 +; SSE41-NEXT: movq %xmm6, %rcx ; SSE41-NEXT: xorq %rdx, %rcx -; SSE41-NEXT: movq %xmm7, %rdx +; SSE41-NEXT: orq %r11, %rcx +; SSE41-NEXT: movq %xmm5, %rdx ; SSE41-NEXT: xorq %rsi, %rdx -; SSE41-NEXT: orq %rcx, %rdx -; SSE41-NEXT: orq %rax, %rdx -; SSE41-NEXT: orq %r11, %rdx -; SSE41-NEXT: pextrq $1, %xmm4, %rax -; SSE41-NEXT: xorq %rdi, %rax -; SSE41-NEXT: pextrq $1, %xmm6, %rcx +; SSE41-NEXT: movq %xmm7, %rsi +; SSE41-NEXT: xorq %rdi, %rsi +; SSE41-NEXT: orq %rdx, %rsi +; SSE41-NEXT: orq %rcx, %rsi +; SSE41-NEXT: pextrq $1, %xmm4, %rcx ; SSE41-NEXT: xorq %r8, %rcx -; SSE41-NEXT: pextrq $1, %xmm5, %rsi -; SSE41-NEXT: xorq %r9, %rsi +; SSE41-NEXT: pextrq $1, %xmm6, %rdx +; SSE41-NEXT: xorq %r9, %rdx +; SSE41-NEXT: orq %rcx, %rdx +; SSE41-NEXT: pextrq $1, %xmm5, %rcx +; SSE41-NEXT: xorq %r10, %rcx ; SSE41-NEXT: pextrq $1, %xmm7, %rdi -; SSE41-NEXT: xorq %r10, %rdi -; SSE41-NEXT: orq %rsi, %rdi +; SSE41-NEXT: xorq %rax, %rdi ; SSE41-NEXT: orq %rcx, %rdi -; SSE41-NEXT: orq %rax, %rdi -; SSE41-NEXT: xorl %eax, %eax ; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: orq %rsi, %rdi ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -522,6 +522,7 @@ ; AVX1-NEXT: xorq %rdx, %r11 ; AVX1-NEXT: vmovq %xmm3, %rdx ; AVX1-NEXT: xorq %rsi, %rdx +; AVX1-NEXT: orq %r11, %rdx ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rsi ; AVX1-NEXT: xorq %rdi, %rsi @@ -530,18 +531,17 @@ ; AVX1-NEXT: xorq %r8, %rdi ; AVX1-NEXT: orq %rsi, %rdi ; AVX1-NEXT: orq %rdx, %rdi -; AVX1-NEXT: orq %r11, %rdi ; AVX1-NEXT: vpextrq $1, %xmm2, %rdx ; AVX1-NEXT: xorq %r9, %rdx ; AVX1-NEXT: vpextrq $1, %xmm3, %rsi ; AVX1-NEXT: xorq %r10, %rsi -; AVX1-NEXT: vpextrq $1, %xmm0, %r8 -; AVX1-NEXT: xorq %rcx, %r8 +; AVX1-NEXT: orq %rdx, %rsi +; AVX1-NEXT: vpextrq $1, %xmm0, %rdx +; AVX1-NEXT: xorq %rcx, %rdx ; AVX1-NEXT: vpextrq $1, %xmm1, %rcx ; AVX1-NEXT: xorq %rax, %rcx -; AVX1-NEXT: orq %r8, %rcx -; AVX1-NEXT: orq %rsi, %rcx ; AVX1-NEXT: orq %rdx, %rcx +; AVX1-NEXT: orq %rsi, %rcx ; AVX1-NEXT: xorl %eax, %eax ; AVX1-NEXT: orq %rdi, %rcx ; AVX1-NEXT: sete %al @@ -564,6 +564,7 @@ ; AVX2-NEXT: xorq %rdx, %r11 ; AVX2-NEXT: vmovq %xmm3, %rdx ; 
AVX2-NEXT: xorq %rsi, %rdx +; AVX2-NEXT: orq %r11, %rdx ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rsi ; AVX2-NEXT: xorq %rdi, %rsi @@ -572,18 +573,17 @@ ; AVX2-NEXT: xorq %r8, %rdi ; AVX2-NEXT: orq %rsi, %rdi ; AVX2-NEXT: orq %rdx, %rdi -; AVX2-NEXT: orq %r11, %rdi ; AVX2-NEXT: vpextrq $1, %xmm2, %rdx ; AVX2-NEXT: xorq %r9, %rdx ; AVX2-NEXT: vpextrq $1, %xmm3, %rsi ; AVX2-NEXT: xorq %r10, %rsi -; AVX2-NEXT: vpextrq $1, %xmm0, %r8 -; AVX2-NEXT: xorq %rcx, %r8 +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: xorq %rcx, %rdx ; AVX2-NEXT: vpextrq $1, %xmm1, %rcx ; AVX2-NEXT: xorq %rax, %rcx -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rsi, %rcx ; AVX2-NEXT: orq %rdx, %rcx +; AVX2-NEXT: orq %rsi, %rcx ; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: orq %rdi, %rcx ; AVX2-NEXT: sete %al @@ -736,28 +736,28 @@ define i32 @ne_i256_pair(ptr %a, ptr %b) { ; SSE2-LABEL: ne_i256_pair: ; SSE2: # %bb.0: -; SSE2-NEXT: movq 16(%rdi), %rcx -; SSE2-NEXT: movq 24(%rdi), %rdx -; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: movq 24(%rdi), %rcx +; SSE2-NEXT: movq (%rdi), %rdx ; SSE2-NEXT: movq 8(%rdi), %r8 ; SSE2-NEXT: xorq 8(%rsi), %r8 -; SSE2-NEXT: xorq 24(%rsi), %rdx -; SSE2-NEXT: xorq (%rsi), %rax -; SSE2-NEXT: xorq 16(%rsi), %rcx +; SSE2-NEXT: xorq 24(%rsi), %rcx +; SSE2-NEXT: xorq (%rsi), %rdx +; SSE2-NEXT: xorq 16(%rsi), %rax ; SSE2-NEXT: movq 48(%rdi), %r9 ; SSE2-NEXT: movq 32(%rdi), %r10 ; SSE2-NEXT: movq 56(%rdi), %r11 ; SSE2-NEXT: movq 40(%rdi), %rdi ; SSE2-NEXT: xorq 40(%rsi), %rdi +; SSE2-NEXT: orq %r8, %rdi ; SSE2-NEXT: xorq 56(%rsi), %r11 -; SSE2-NEXT: orq %rdx, %r11 +; SSE2-NEXT: orq %rcx, %r11 ; SSE2-NEXT: orq %rdi, %r11 -; SSE2-NEXT: orq %r8, %r11 ; SSE2-NEXT: xorq 32(%rsi), %r10 +; SSE2-NEXT: orq %rdx, %r10 ; SSE2-NEXT: xorq 48(%rsi), %r9 -; SSE2-NEXT: orq %rcx, %r9 -; SSE2-NEXT: orq %r10, %r9 ; SSE2-NEXT: orq %rax, %r9 +; SSE2-NEXT: orq %r10, %r9 ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: orq %r11, %r9 ; SSE2-NEXT: setne %al @@ -765,28 +765,28 @@ ; ; SSE41-LABEL: ne_i256_pair: ; SSE41: # %bb.0: -; SSE41-NEXT: movq 16(%rdi), %rcx -; SSE41-NEXT: movq 24(%rdi), %rdx -; SSE41-NEXT: movq (%rdi), %rax +; SSE41-NEXT: movq 16(%rdi), %rax +; SSE41-NEXT: movq 24(%rdi), %rcx +; SSE41-NEXT: movq (%rdi), %rdx ; SSE41-NEXT: movq 8(%rdi), %r8 ; SSE41-NEXT: xorq 8(%rsi), %r8 -; SSE41-NEXT: xorq 24(%rsi), %rdx -; SSE41-NEXT: xorq (%rsi), %rax -; SSE41-NEXT: xorq 16(%rsi), %rcx +; SSE41-NEXT: xorq 24(%rsi), %rcx +; SSE41-NEXT: xorq (%rsi), %rdx +; SSE41-NEXT: xorq 16(%rsi), %rax ; SSE41-NEXT: movq 48(%rdi), %r9 ; SSE41-NEXT: movq 32(%rdi), %r10 ; SSE41-NEXT: movq 56(%rdi), %r11 ; SSE41-NEXT: movq 40(%rdi), %rdi ; SSE41-NEXT: xorq 40(%rsi), %rdi +; SSE41-NEXT: orq %r8, %rdi ; SSE41-NEXT: xorq 56(%rsi), %r11 -; SSE41-NEXT: orq %rdx, %r11 +; SSE41-NEXT: orq %rcx, %r11 ; SSE41-NEXT: orq %rdi, %r11 -; SSE41-NEXT: orq %r8, %r11 ; SSE41-NEXT: xorq 32(%rsi), %r10 +; SSE41-NEXT: orq %rdx, %r10 ; SSE41-NEXT: xorq 48(%rsi), %r9 -; SSE41-NEXT: orq %rcx, %r9 -; SSE41-NEXT: orq %r10, %r9 ; SSE41-NEXT: orq %rax, %r9 +; SSE41-NEXT: orq %r10, %r9 ; SSE41-NEXT: xorl %eax, %eax ; SSE41-NEXT: orq %r11, %r9 ; SSE41-NEXT: setne %al @@ -850,28 +850,28 @@ define i32 @eq_i256_pair(ptr %a, ptr %b) { ; SSE2-LABEL: eq_i256_pair: ; SSE2: # %bb.0: -; SSE2-NEXT: movq 16(%rdi), %rcx -; SSE2-NEXT: movq 24(%rdi), %rdx -; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 16(%rdi), %rax +; SSE2-NEXT: movq 24(%rdi), %rcx +; SSE2-NEXT: movq (%rdi), %rdx ; 
SSE2-NEXT: movq 8(%rdi), %r8 ; SSE2-NEXT: xorq 8(%rsi), %r8 -; SSE2-NEXT: xorq 24(%rsi), %rdx -; SSE2-NEXT: xorq (%rsi), %rax -; SSE2-NEXT: xorq 16(%rsi), %rcx +; SSE2-NEXT: xorq 24(%rsi), %rcx +; SSE2-NEXT: xorq (%rsi), %rdx +; SSE2-NEXT: xorq 16(%rsi), %rax ; SSE2-NEXT: movq 48(%rdi), %r9 ; SSE2-NEXT: movq 32(%rdi), %r10 ; SSE2-NEXT: movq 56(%rdi), %r11 ; SSE2-NEXT: movq 40(%rdi), %rdi ; SSE2-NEXT: xorq 40(%rsi), %rdi +; SSE2-NEXT: orq %r8, %rdi ; SSE2-NEXT: xorq 56(%rsi), %r11 -; SSE2-NEXT: orq %rdx, %r11 +; SSE2-NEXT: orq %rcx, %r11 ; SSE2-NEXT: orq %rdi, %r11 -; SSE2-NEXT: orq %r8, %r11 ; SSE2-NEXT: xorq 32(%rsi), %r10 +; SSE2-NEXT: orq %rdx, %r10 ; SSE2-NEXT: xorq 48(%rsi), %r9 -; SSE2-NEXT: orq %rcx, %r9 -; SSE2-NEXT: orq %r10, %r9 ; SSE2-NEXT: orq %rax, %r9 +; SSE2-NEXT: orq %r10, %r9 ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: orq %r11, %r9 ; SSE2-NEXT: sete %al @@ -879,28 +879,28 @@ ; ; SSE41-LABEL: eq_i256_pair: ; SSE41: # %bb.0: -; SSE41-NEXT: movq 16(%rdi), %rcx -; SSE41-NEXT: movq 24(%rdi), %rdx -; SSE41-NEXT: movq (%rdi), %rax +; SSE41-NEXT: movq 16(%rdi), %rax +; SSE41-NEXT: movq 24(%rdi), %rcx +; SSE41-NEXT: movq (%rdi), %rdx ; SSE41-NEXT: movq 8(%rdi), %r8 ; SSE41-NEXT: xorq 8(%rsi), %r8 -; SSE41-NEXT: xorq 24(%rsi), %rdx -; SSE41-NEXT: xorq (%rsi), %rax -; SSE41-NEXT: xorq 16(%rsi), %rcx +; SSE41-NEXT: xorq 24(%rsi), %rcx +; SSE41-NEXT: xorq (%rsi), %rdx +; SSE41-NEXT: xorq 16(%rsi), %rax ; SSE41-NEXT: movq 48(%rdi), %r9 ; SSE41-NEXT: movq 32(%rdi), %r10 ; SSE41-NEXT: movq 56(%rdi), %r11 ; SSE41-NEXT: movq 40(%rdi), %rdi ; SSE41-NEXT: xorq 40(%rsi), %rdi +; SSE41-NEXT: orq %r8, %rdi ; SSE41-NEXT: xorq 56(%rsi), %r11 -; SSE41-NEXT: orq %rdx, %r11 +; SSE41-NEXT: orq %rcx, %r11 ; SSE41-NEXT: orq %rdi, %r11 -; SSE41-NEXT: orq %r8, %r11 ; SSE41-NEXT: xorq 32(%rsi), %r10 +; SSE41-NEXT: orq %rdx, %r10 ; SSE41-NEXT: xorq 48(%rsi), %r9 -; SSE41-NEXT: orq %rcx, %r9 -; SSE41-NEXT: orq %r10, %r9 ; SSE41-NEXT: orq %rax, %r9 +; SSE41-NEXT: orq %r10, %r9 ; SSE41-NEXT: xorl %eax, %eax ; SSE41-NEXT: orq %r11, %r9 ; SSE41-NEXT: sete %al @@ -964,54 +964,54 @@ define i32 @ne_i512_pair(ptr %a, ptr %b) { ; NO512-LABEL: ne_i512_pair: ; NO512: # %bb.0: -; NO512-NEXT: movq 32(%rdi), %rax -; NO512-NEXT: movq 48(%rdi), %rcx -; NO512-NEXT: movq 40(%rdi), %rdx -; NO512-NEXT: movq 56(%rdi), %r8 -; NO512-NEXT: xorq 56(%rsi), %r8 -; NO512-NEXT: movq 120(%rdi), %r9 -; NO512-NEXT: xorq 120(%rsi), %r9 -; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 40(%rdi), %rax +; NO512-NEXT: movq 56(%rdi), %rcx +; NO512-NEXT: movq 24(%rdi), %rdx +; NO512-NEXT: xorq 24(%rsi), %rdx +; NO512-NEXT: xorq 56(%rsi), %rcx ; NO512-NEXT: movq 88(%rdi), %r8 ; NO512-NEXT: xorq 88(%rsi), %r8 -; NO512-NEXT: orq %r8, %r9 -; NO512-NEXT: movq 24(%rdi), %r8 -; NO512-NEXT: xorq 24(%rsi), %r8 -; NO512-NEXT: xorq 40(%rsi), %rdx -; NO512-NEXT: orq %r8, %r9 -; NO512-NEXT: movq 104(%rdi), %r8 -; NO512-NEXT: xorq 104(%rsi), %r8 ; NO512-NEXT: orq %rdx, %r8 -; NO512-NEXT: movq 72(%rdi), %rdx -; NO512-NEXT: xorq 72(%rsi), %rdx +; NO512-NEXT: movq 120(%rdi), %rdx +; NO512-NEXT: xorq 120(%rsi), %rdx +; NO512-NEXT: orq %rcx, %rdx +; NO512-NEXT: movq 8(%rdi), %rcx +; NO512-NEXT: xorq 8(%rsi), %rcx +; NO512-NEXT: xorq 40(%rsi), %rax +; NO512-NEXT: orq %r8, %rdx +; NO512-NEXT: movq 72(%rdi), %r8 +; NO512-NEXT: xorq 72(%rsi), %r8 +; NO512-NEXT: orq %rcx, %r8 +; NO512-NEXT: movq 104(%rdi), %rcx +; NO512-NEXT: xorq 104(%rsi), %rcx +; NO512-NEXT: orq %rax, %rcx +; NO512-NEXT: movq 48(%rdi), %rax +; NO512-NEXT: orq %r8, %rcx +; NO512-NEXT: movq 16(%rdi), %r8 
+; NO512-NEXT: xorq 16(%rsi), %r8 +; NO512-NEXT: xorq 48(%rsi), %rax +; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: movq 80(%rdi), %rdx +; NO512-NEXT: xorq 80(%rsi), %rdx +; NO512-NEXT: orq %r8, %rdx +; NO512-NEXT: movq 112(%rdi), %r8 +; NO512-NEXT: xorq 112(%rsi), %r8 +; NO512-NEXT: orq %rax, %r8 +; NO512-NEXT: movq (%rdi), %rax +; NO512-NEXT: xorq (%rsi), %rax ; NO512-NEXT: orq %rdx, %r8 -; NO512-NEXT: movq 16(%rdi), %rdx -; NO512-NEXT: orq %r9, %r8 -; NO512-NEXT: movq 8(%rdi), %r9 -; NO512-NEXT: xorq 8(%rsi), %r9 -; NO512-NEXT: xorq 48(%rsi), %rcx -; NO512-NEXT: orq %r9, %r8 -; NO512-NEXT: movq 112(%rdi), %r9 -; NO512-NEXT: xorq 112(%rsi), %r9 -; NO512-NEXT: orq %rcx, %r9 -; NO512-NEXT: movq 80(%rdi), %rcx -; NO512-NEXT: xorq 80(%rsi), %rcx -; NO512-NEXT: orq %rcx, %r9 -; NO512-NEXT: movq (%rdi), %rcx -; NO512-NEXT: xorq 16(%rsi), %rdx -; NO512-NEXT: xorq (%rsi), %rcx -; NO512-NEXT: xorq 32(%rsi), %rax -; NO512-NEXT: orq %rdx, %r9 -; NO512-NEXT: movq 96(%rdi), %rdx -; NO512-NEXT: movq 64(%rdi), %rdi -; NO512-NEXT: xorq 64(%rsi), %rdi -; NO512-NEXT: xorq 96(%rsi), %rdx +; NO512-NEXT: movq 64(%rdi), %rdx +; NO512-NEXT: xorq 64(%rsi), %rdx ; NO512-NEXT: orq %rax, %rdx -; NO512-NEXT: orq %rdi, %rdx -; NO512-NEXT: orq %r9, %rdx -; NO512-NEXT: orq %rcx, %rdx +; NO512-NEXT: movq 32(%rdi), %rax +; NO512-NEXT: xorq 32(%rsi), %rax +; NO512-NEXT: movq 96(%rdi), %rdi +; NO512-NEXT: xorq 96(%rsi), %rdi +; NO512-NEXT: orq %rax, %rdi +; NO512-NEXT: orq %rdx, %rdi +; NO512-NEXT: orq %r8, %rdi ; NO512-NEXT: xorl %eax, %eax -; NO512-NEXT: orq %r8, %rdx +; NO512-NEXT: orq %rcx, %rdi ; NO512-NEXT: setne %al ; NO512-NEXT: retq ; @@ -1058,54 +1058,54 @@ define i32 @eq_i512_pair(ptr %a, ptr %b) { ; NO512-LABEL: eq_i512_pair: ; NO512: # %bb.0: -; NO512-NEXT: movq 32(%rdi), %rax -; NO512-NEXT: movq 48(%rdi), %rcx -; NO512-NEXT: movq 40(%rdi), %rdx -; NO512-NEXT: movq 56(%rdi), %r8 -; NO512-NEXT: xorq 56(%rsi), %r8 -; NO512-NEXT: movq 120(%rdi), %r9 -; NO512-NEXT: xorq 120(%rsi), %r9 -; NO512-NEXT: orq %r8, %r9 +; NO512-NEXT: movq 40(%rdi), %rax +; NO512-NEXT: movq 56(%rdi), %rcx +; NO512-NEXT: movq 24(%rdi), %rdx +; NO512-NEXT: xorq 24(%rsi), %rdx +; NO512-NEXT: xorq 56(%rsi), %rcx ; NO512-NEXT: movq 88(%rdi), %r8 ; NO512-NEXT: xorq 88(%rsi), %r8 -; NO512-NEXT: orq %r8, %r9 -; NO512-NEXT: movq 24(%rdi), %r8 -; NO512-NEXT: xorq 24(%rsi), %r8 -; NO512-NEXT: xorq 40(%rsi), %rdx -; NO512-NEXT: orq %r8, %r9 -; NO512-NEXT: movq 104(%rdi), %r8 -; NO512-NEXT: xorq 104(%rsi), %r8 ; NO512-NEXT: orq %rdx, %r8 -; NO512-NEXT: movq 72(%rdi), %rdx -; NO512-NEXT: xorq 72(%rsi), %rdx +; NO512-NEXT: movq 120(%rdi), %rdx +; NO512-NEXT: xorq 120(%rsi), %rdx +; NO512-NEXT: orq %rcx, %rdx +; NO512-NEXT: movq 8(%rdi), %rcx +; NO512-NEXT: xorq 8(%rsi), %rcx +; NO512-NEXT: xorq 40(%rsi), %rax +; NO512-NEXT: orq %r8, %rdx +; NO512-NEXT: movq 72(%rdi), %r8 +; NO512-NEXT: xorq 72(%rsi), %r8 +; NO512-NEXT: orq %rcx, %r8 +; NO512-NEXT: movq 104(%rdi), %rcx +; NO512-NEXT: xorq 104(%rsi), %rcx +; NO512-NEXT: orq %rax, %rcx +; NO512-NEXT: movq 48(%rdi), %rax +; NO512-NEXT: orq %r8, %rcx +; NO512-NEXT: movq 16(%rdi), %r8 +; NO512-NEXT: xorq 16(%rsi), %r8 +; NO512-NEXT: xorq 48(%rsi), %rax +; NO512-NEXT: orq %rdx, %rcx +; NO512-NEXT: movq 80(%rdi), %rdx +; NO512-NEXT: xorq 80(%rsi), %rdx +; NO512-NEXT: orq %r8, %rdx +; NO512-NEXT: movq 112(%rdi), %r8 +; NO512-NEXT: xorq 112(%rsi), %r8 +; NO512-NEXT: orq %rax, %r8 +; NO512-NEXT: movq (%rdi), %rax +; NO512-NEXT: xorq (%rsi), %rax ; NO512-NEXT: orq %rdx, %r8 -; NO512-NEXT: movq 16(%rdi), %rdx -; 
NO512-NEXT: orq %r9, %r8 -; NO512-NEXT: movq 8(%rdi), %r9 -; NO512-NEXT: xorq 8(%rsi), %r9 -; NO512-NEXT: xorq 48(%rsi), %rcx -; NO512-NEXT: orq %r9, %r8 -; NO512-NEXT: movq 112(%rdi), %r9 -; NO512-NEXT: xorq 112(%rsi), %r9 -; NO512-NEXT: orq %rcx, %r9 -; NO512-NEXT: movq 80(%rdi), %rcx -; NO512-NEXT: xorq 80(%rsi), %rcx -; NO512-NEXT: orq %rcx, %r9 -; NO512-NEXT: movq (%rdi), %rcx -; NO512-NEXT: xorq 16(%rsi), %rdx -; NO512-NEXT: xorq (%rsi), %rcx -; NO512-NEXT: xorq 32(%rsi), %rax -; NO512-NEXT: orq %rdx, %r9 -; NO512-NEXT: movq 96(%rdi), %rdx -; NO512-NEXT: movq 64(%rdi), %rdi -; NO512-NEXT: xorq 64(%rsi), %rdi -; NO512-NEXT: xorq 96(%rsi), %rdx +; NO512-NEXT: movq 64(%rdi), %rdx +; NO512-NEXT: xorq 64(%rsi), %rdx ; NO512-NEXT: orq %rax, %rdx -; NO512-NEXT: orq %rdi, %rdx -; NO512-NEXT: orq %r9, %rdx -; NO512-NEXT: orq %rcx, %rdx +; NO512-NEXT: movq 32(%rdi), %rax +; NO512-NEXT: xorq 32(%rsi), %rax +; NO512-NEXT: movq 96(%rdi), %rdi +; NO512-NEXT: xorq 96(%rsi), %rdi +; NO512-NEXT: orq %rax, %rdi +; NO512-NEXT: orq %rdx, %rdi +; NO512-NEXT: orq %r8, %rdi ; NO512-NEXT: xorl %eax, %eax -; NO512-NEXT: orq %r8, %rdx +; NO512-NEXT: orq %rcx, %rdi ; NO512-NEXT: sete %al ; NO512-NEXT: retq ; @@ -1184,16 +1184,16 @@ ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx ; ANY-NEXT: orq %r10, %rcx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 -; ANY-NEXT: orq %rcx, %r9 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi ; ANY-NEXT: orq %r9, %rsi +; ANY-NEXT: orq %rcx, %rsi ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx ; ANY-NEXT: orq %rax, %rdx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; ANY-NEXT: orq %rdx, %r8 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi ; ANY-NEXT: orq %r8, %rdi +; ANY-NEXT: orq %rdx, %rdi ; ANY-NEXT: orq %rsi, %rdi ; ANY-NEXT: sete %al ; ANY-NEXT: retq @@ -1252,18 +1252,18 @@ ; ANY-NEXT: adcq $0, %rax ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9 +; ANY-NEXT: orq %rsi, %r9 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax ; ANY-NEXT: orq %rcx, %rax ; ANY-NEXT: orq %r9, %rax -; ANY-NEXT: orq %rsi, %rax ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 ; ANY-NEXT: orq %rdx, %r10 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8 -; ANY-NEXT: orq %r10, %r8 ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi ; ANY-NEXT: orq %r8, %rdi +; ANY-NEXT: orq %r10, %rdi ; ANY-NEXT: orq %rax, %rdi ; ANY-NEXT: sete %al ; ANY-NEXT: retq @@ -1313,14 +1313,14 @@ ; ANY-NEXT: orq %r8, %r11 ; ANY-NEXT: xorq 8(%rdi), %rdx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax -; ANY-NEXT: orq %r11, %rax ; ANY-NEXT: orq %rdx, %rax +; ANY-NEXT: orq %r11, %rax ; ANY-NEXT: xorq 32(%rdi), %r9 ; ANY-NEXT: xorq (%rdi), %rsi +; ANY-NEXT: orq %r9, %rsi ; ANY-NEXT: xorq 16(%rdi), %rcx ; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10 ; ANY-NEXT: orq %rcx, %r10 -; ANY-NEXT: orq %r9, %r10 ; ANY-NEXT: orq %rsi, %r10 ; ANY-NEXT: orq %rax, %r10 ; ANY-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -630,8 +630,8 @@ ; X64-NEXT: movl %edx, %eax ; X64-NEXT: shll $15, %edi ; X64-NEXT: shll $16, %eax +; X64-NEXT: orl %esi, %edi ; X64-NEXT: orl %ecx, %eax -; X64-NEXT: orl %esi, %eax ; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %a.shifted = shl i32 %a, 15 @@ -658,8 +658,8 @@ ; X64-NEXT: movl %edx, %eax ; X64-NEXT: shll $16, %edi ; X64-NEXT: shrl $16, %eax +; X64-NEXT: orl %esi, %edi ; X64-NEXT: orl %ecx, %eax -; X64-NEXT: orl %esi, 
%eax ; X64-NEXT: orl %edi, %eax ; X64-NEXT: retq %a.shifted = shl i32 %a, 16 @@ -777,8 +777,8 @@ ; X64: # %bb.0: ; X64-NEXT: pslld $16, %xmm0 ; X64-NEXT: pslld $17, %xmm2 +; X64-NEXT: por %xmm1, %xmm0 ; X64-NEXT: por %xmm3, %xmm2 -; X64-NEXT: por %xmm1, %xmm2 ; X64-NEXT: por %xmm2, %xmm0 ; X64-NEXT: retq %a.shifted = shl <4 x i32> %a, diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll --- a/llvm/test/CodeGen/X86/smul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll @@ -191,7 +191,7 @@ ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $192, %esp +; X86-NEXT: subl $188, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andl $1, %eax ; X86-NEXT: negl %eax @@ -229,7 +229,7 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -254,14 +254,14 @@ ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edi, %esi ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: setb %al ; X86-NEXT: addl %edx, %esi @@ -297,7 +297,7 @@ ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -329,7 +329,7 @@ ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -363,9 +363,9 @@ ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx @@ -381,29 +381,31 @@ ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: setb %bl +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl %bl, %esi -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: movl %ebp, %esi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: addl %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: adcl %ecx, %esi -; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: adcl $0, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl (%esp), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -415,10 +417,10 @@ ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -427,46 +429,45 @@ ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl %edi, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %ebp, %esi ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebp, %ecx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: addl %ebx, %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: addl %ebp, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: setb %al -; X86-NEXT: addl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl %al, %ebp ; X86-NEXT: adcl %edx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %ebx, %edx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %esi, %edi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload @@ -479,11 +480,11 @@ ; X86-NEXT: adcl %ebx, %ebp ; X86-NEXT: setb %cl ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: movzbl %cl, %ebx ; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl (%esp), %ebp # 4-byte Reload ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -505,7 +506,7 @@ ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload @@ -559,7 +560,7 @@ ; X86-NEXT: adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %ebp # 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx @@ -582,7 +583,7 @@ ; X86-NEXT: addl %esi, %edx ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload @@ -601,10 +602,10 
@@ ; X86-NEXT: movl %ebp, %edi ; X86-NEXT: adcl %ecx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -612,8 +613,8 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %edi, %esi -; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: addl %ebx, %esi +; X86-NEXT: adcl %edi, %ecx ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: addl %eax, %esi @@ -621,7 +622,7 @@ ; X86-NEXT: movzbl %bl, %edi ; X86-NEXT: adcl %edx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload @@ -655,7 +656,7 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax @@ -674,125 +675,132 @@ ; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: adcl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl %esi, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl %ebp, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: addl %edx, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %eax, (%esp) # 4-byte Folded Spill ; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte 
Folded Reload +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: movl %ebp, %edi ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %ebp -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %ebp, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: setb %al -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl %edx, %ebx -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %eax, %esi +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl %edx, %esi -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: addl %edx, %eax -; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movl %eax, %edi +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl %ebx, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl %ebx, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, %eax ; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: xorl %edx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: xorl %edx, %ebp ; X86-NEXT: xorl %edx, %esi -; X86-NEXT: orl %ebp, %esi +; X86-NEXT: xorl %edx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: xorl %edx, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: xorl %edx, %eax +; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: xorl %edx, %ebp +; X86-NEXT: orl %ebx, %ebp +; 
X86-NEXT: movl %edi, %esi +; X86-NEXT: xorl %edx, %esi ; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %ebp, %edx ; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-NEXT: movl %edi, %ecx ; X86-NEXT: andl $1, %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: negl %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: xorl %eax, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: xorl %eax, %esi ; X86-NEXT: orl %ebx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: xorl %eax, %ebx -; X86-NEXT: orl %esi, %ebx ; X86-NEXT: xorl %edi, %eax ; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl %esi, %eax ; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload @@ -805,7 +813,7 @@ ; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: movb %cl, 16(%eax) ; X86-NEXT: setne 20(%eax) -; X86-NEXT: addl $192, %esp +; X86-NEXT: addl $188, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -973,9 +981,9 @@ ; X64-NEXT: xorq %rcx, %r10 ; X64-NEXT: orq %rdx, %r10 ; X64-NEXT: xorq %rcx, %rax -; X64-NEXT: orq %r10, %rax ; X64-NEXT: xorq %rbx, %rcx ; X64-NEXT: orq %rax, %rcx +; X64-NEXT: orq %r10, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; X64-NEXT: movl %eax, %esi ; X64-NEXT: andl $1, %esi diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -231,8 +231,8 @@ ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %ecx, %esi ; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -376,65 +376,66 @@ ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: imull %edx, %ecx +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: imull %edx, %ebx ; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull %ebx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp ; X86-NEXT: addl %eax, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ecx, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: addl %eax, %ebx +; X86-NEXT: addl 
%edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl $0, %ebp +; X86-NEXT: adcl %esi, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload -; X86-NEXT: adcl %edi, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload ; X86-NEXT: adcl %esi, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: movl %ebp, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: xorl %esi, %edx -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: xorl $2147483647, %edi # imm = 0x7FFFFFFF -; X86-NEXT: orl %edx, %esi -; X86-NEXT: notl %ebx -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: cmovel %ecx, %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %edi, %edx +; X86-NEXT: xorl $2147483647, %esi # imm = 0x7FFFFFFF +; X86-NEXT: orl %edx, %edi +; X86-NEXT: notl %ecx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovel %ebp, %esi +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %esi, %edx ; X86-NEXT: addl $12, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -14,56 +14,58 @@ ; X64-NEXT: .cfi_offset %rbx, -32 ; X64-NEXT: .cfi_offset %r14, -24 ; X64-NEXT: .cfi_offset %r15, -16 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rdi, %r11 ; X64-NEXT: movq %rsi, %rdi ; X64-NEXT: sarq $63, %rdi -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: imulq %rdi, %rbx +; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: imulq %rdi, %r9 ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %rax, %rbx -; X64-NEXT: addq %rdx, %rbx +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rax, %rdi +; X64-NEXT: addq %r9, %rdi ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: sarq $63, %rax ; X64-NEXT: movq %rax, %r15 ; X64-NEXT: imulq %rsi, %r15 -; 
X64-NEXT: mulq %r10 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rax, %r15 -; X64-NEXT: addq %rdx, %r15 -; X64-NEXT: addq %rdi, %r9 -; X64-NEXT: adcq %rbx, %r15 -; X64-NEXT: movq %r10, %rax ; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: addq %r15, %r9 +; X64-NEXT: addq %rax, %r9 +; X64-NEXT: addq %r14, %r10 +; X64-NEXT: adcq %rdi, %r9 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rbx +; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq %rsi, %rax -; X64-NEXT: mulq %r11 -; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: addq %rbx, %r14 -; X64-NEXT: adcq $0, %r11 -; X64-NEXT: movq %r10, %rax -; X64-NEXT: mulq %rcx +; X64-NEXT: mulq %rbx ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r14, %r10 -; X64-NEXT: adcq %r11, %rbx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r14, %r15 +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %r15, %r11 +; X64-NEXT: adcq %rbx, %r14 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %r11d +; X64-NEXT: movzbl %al, %ebx ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: addq %rbx, %rax -; X64-NEXT: adcq %r11, %rdx -; X64-NEXT: addq %r9, %rax -; X64-NEXT: adcq %r15, %rdx -; X64-NEXT: movq %r10, 8(%r8) -; X64-NEXT: sarq $63, %r10 -; X64-NEXT: xorq %r10, %rdx -; X64-NEXT: xorq %rax, %r10 -; X64-NEXT: orq %rdx, %r10 +; X64-NEXT: addq %r14, %rax +; X64-NEXT: adcq %rbx, %rdx +; X64-NEXT: addq %r10, %rax +; X64-NEXT: adcq %r9, %rdx +; X64-NEXT: movq %r11, 8(%r8) +; X64-NEXT: sarq $63, %r11 +; X64-NEXT: xorq %r11, %rdx +; X64-NEXT: xorq %rax, %r11 +; X64-NEXT: orq %rdx, %r11 ; X64-NEXT: setne %al ; X64-NEXT: movq %rdi, (%r8) ; X64-NEXT: popq %rbx @@ -88,217 +90,218 @@ ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull 
%esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebx, %esi +; X86-NEXT: addl %edi, %esi ; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi +; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %ecx +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: adcl %ebp, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebp +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl (%esp), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: setb (%esp) ## 1-byte Folded Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte 
Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: mull %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl %esi, %edx -; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sarl $31, %edi +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: sarl $31, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movl %ebp, (%esp) ## 4-byte Spill -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %ebx -; X86-NEXT: setb %cl -; X86-NEXT: movl %edi, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %edi, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %edi, %esi +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: 
imull %eax, %esi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: imull %eax, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %esi -; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %ebp, %eax -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movzbl %cl, %ecx +; X86-NEXT: addl %edi, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: addl %eax, %esi +; X86-NEXT: adcl %edx, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %edx, %edi -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %eax, %edi -; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: addl %eax, %ebp +; X86-NEXT: adcl %edx, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: imull %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl %esi, %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: adcl %edi, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: mull %ecx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: adcl %ebp, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl %ecx, %edi +; X86-NEXT: adcl %edx, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte 
Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sarl $31, %eax -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: xorl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: xorl %esi, %eax -; X86-NEXT: orl %ebp, %eax -; X86-NEXT: orl %edi, %eax +; X86-NEXT: adcl %esi, %edi +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: xorl %ecx, %ebp +; X86-NEXT: orl %eax, %ebp +; X86-NEXT: xorl %ecx, %edi +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: orl %ebp, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload @@ -340,36 +343,36 @@ ; X64-NEXT: .cfi_offset %r14, -32 ; X64-NEXT: .cfi_offset %r15, -24 ; X64-NEXT: .cfi_offset %rbp, -16 -; X64-NEXT: movq %rcx, %r15 +; X64-NEXT: movq %rcx, %r13 ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rsi, %r11 ; X64-NEXT: movq %rdx, %rax ; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: mulq %r8 +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rsi, %rbx -; X64-NEXT: adcq $0, %rcx +; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: addq %rbx, %r14 -; X64-NEXT: adcq %rcx, %r12 +; X64-NEXT: adcq %rsi, %r12 ; X64-NEXT: setb %al -; X64-NEXT: movzbl %al, %ecx -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movzbl %al, %r10d +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %r9 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r12, %rsi -; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: adcq %r10, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill @@ -386,168 +389,171 @@ ; X64-NEXT: addq %r13, %rax ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: adcq %r12, %rbx -; X64-NEXT: setb %r8b +; X64-NEXT: setb %r10b ; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rbx, %rbp -; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: movzbl %r10b, %eax ; X64-NEXT: adcq %rax, %r13 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 -; X64-NEXT: addq %r10, %rbp +; X64-NEXT: addq %r15, %rbp ; X64-NEXT: adcq %r14, %r13 ; X64-NEXT: adcq 
$0, %rsi ; X64-NEXT: adcq $0, %rcx -; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdi, %r14 +; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r15 ; X64-NEXT: movq %r11, %rbx +; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r9 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %r8, %r9 -; X64-NEXT: adcq $0, %r10 +; X64-NEXT: movq %rdx, %r9 +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rdi, %r10 +; X64-NEXT: adcq $0, %r9 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r9, %rax -; X64-NEXT: movq %rax, %rdi -; X64-NEXT: adcq %r10, %r11 -; X64-NEXT: setb %cl +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %r10, %r14 +; X64-NEXT: adcq %r9, %r11 +; X64-NEXT: setb %r10b ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %r10 -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %r11, %r8 -; X64-NEXT: movzbl %cl, %eax -; X64-NEXT: adcq %rax, %r10 -; X64-NEXT: addq %rbp, %r14 +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %r11, %r9 +; X64-NEXT: movzbl %r10b, %eax +; X64-NEXT: adcq %rax, %rdi +; X64-NEXT: addq %rbp, %r15 +; X64-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r13, %r14 ; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r13, %rdi -; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq $0, %r8 -; X64-NEXT: adcq $0, %r10 -; X64-NEXT: addq %rsi, %r8 -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Folded Reload -; X64-NEXT: setb %cl +; X64-NEXT: adcq $0, %r9 +; X64-NEXT: adcq $0, %rdi +; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: adcq %rcx, %rdi +; X64-NEXT: setb %bl ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 ## 8-byte Reload ; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; X64-NEXT: mulq %rdi +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload +; X64-NEXT: movq %r14, %rax +; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r14 -; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rsi, %r9 -; X64-NEXT: adcq $0, %rdi +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: addq %rcx, %r10 +; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: addq %r9, %rax -; X64-NEXT: movq %rax, %r9 -; X64-NEXT: adcq %rdi, %r11 -; X64-NEXT: setb %sil -; X64-NEXT: movq %r15, %rax +; X64-NEXT: addq %r10, %rax +; X64-NEXT: movq %rax, %r10 +; X64-NEXT: adcq %rsi, %r11 +; X64-NEXT: setb %cl +; X64-NEXT: movq %r14, %rax ; X64-NEXT: mulq %r12 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %r13 -; X64-NEXT: addq %r11, %r13 -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: adcq %rax, %rbp -; X64-NEXT: addq %r8, %r14 -; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r10, %r9 -; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movq %rdx, %r13 +; 
X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %r11, %r15 ; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: adcq %rax, %r13 -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %r15, %r8 -; X64-NEXT: sarq $63, %r8 -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: mulq %r8 +; X64-NEXT: addq %r9, %rbp +; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %rdi, %r10 +; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: movzbl %bl, %eax +; X64-NEXT: adcq %rax, %r15 +; X64-NEXT: adcq $0, %r13 +; X64-NEXT: movq %r14, %rdi +; X64-NEXT: movq %r14, %rbp +; X64-NEXT: sarq $63, %rdi +; X64-NEXT: movq %r8, %rax +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r9 -; X64-NEXT: movq %rax, %rsi +; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: movq %rax, %r10 -; X64-NEXT: addq %r9, %r10 +; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: adcq $0, %r14 -; X64-NEXT: addq %rsi, %r10 -; X64-NEXT: movq %rsi, %rdi -; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; X64-NEXT: adcq %r9, %r14 -; X64-NEXT: setb %sil -; X64-NEXT: movq %r8, %r9 -; X64-NEXT: imulq %r12, %r9 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: movq %rax, %r8 +; X64-NEXT: addq %r9, %r8 +; X64-NEXT: movq %rdx, %r10 +; X64-NEXT: adcq $0, %r10 +; X64-NEXT: addq %rcx, %r8 +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; X64-NEXT: adcq %r9, %r10 +; X64-NEXT: setb %cl +; X64-NEXT: movq %rdi, %rsi +; X64-NEXT: imulq %r12, %rsi +; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq {{[0-9]+}}(%rsp) -; X64-NEXT: addq %rax, %r9 -; X64-NEXT: addq %rdx, %r9 -; X64-NEXT: addq %rdi, %rax -; X64-NEXT: adcq %r10, %r9 -; X64-NEXT: addq %r11, %r14 -; X64-NEXT: movzbl %sil, %edi -; X64-NEXT: adcq %rcx, %rdi -; X64-NEXT: addq %rax, %r14 -; X64-NEXT: adcq %r9, %rdi +; X64-NEXT: addq %rsi, %rdx +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: addq %rbx, %rax +; X64-NEXT: adcq %r8, %rdx +; X64-NEXT: addq %r11, %r10 +; X64-NEXT: movzbl %cl, %esi +; X64-NEXT: adcq %r14, %rsi +; X64-NEXT: addq %rax, %r10 +; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: sarq $63, %r12 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, %rsi ; X64-NEXT: movq %rax, %rcx -; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movq %rax, %r14 +; X64-NEXT: addq %rdx, %r14 ; X64-NEXT: adcq $0, %r11 ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %rbx -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: mulq {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Folded Reload +; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rax, %rcx +; X64-NEXT: addq %rax, %r14 ; X64-NEXT: adcq %rdx, %r11 ; X64-NEXT: setb %bl -; X64-NEXT: imulq %r12, %r15 +; X64-NEXT: imulq %r12, %rbp ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload ; X64-NEXT: mulq %r12 -; X64-NEXT: addq %rax, %r15 -; X64-NEXT: addq %rdx, %r15 -; X64-NEXT: addq %rsi, %rax -; X64-NEXT: adcq %rcx, %r15 +; X64-NEXT: addq %rax, %rdx +; X64-NEXT: addq %rbp, %rdx +; X64-NEXT: addq %rcx, %rax +; X64-NEXT: adcq %r14, %rdx ; X64-NEXT: addq %r9, %r11 -; X64-NEXT: movzbl %bl, %edx -; X64-NEXT: adcq %r8, %rdx +; X64-NEXT: movzbl %bl, %r9d +; X64-NEXT: adcq %rdi, %r9 ; X64-NEXT: addq %rax, %r11 -; X64-NEXT: adcq %r15, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload -; X64-NEXT: 
adcq %r10, %rcx -; X64-NEXT: adcq %r14, %r11 -; X64-NEXT: adcq %rdi, %rdx -; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Folded Reload -; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload -; X64-NEXT: adcq %r13, %r11 -; X64-NEXT: adcq %rbp, %rdx -; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: adcq %rdx, %r9 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload +; X64-NEXT: adcq %r8, %r14 +; X64-NEXT: adcq %r10, %r11 +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload +; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload +; X64-NEXT: adcq %r15, %r11 +; X64-NEXT: adcq %r13, %r9 +; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload +; X64-NEXT: movq %rdx, %rax ; X64-NEXT: sarq $63, %rax -; X64-NEXT: xorq %rax, %rdx -; X64-NEXT: xorq %rax, %rcx -; X64-NEXT: orq %rdx, %rcx +; X64-NEXT: xorq %rax, %r9 +; X64-NEXT: xorq %rax, %r14 +; X64-NEXT: orq %r9, %r14 ; X64-NEXT: xorq %rax, %r11 -; X64-NEXT: xorq %rsi, %rax +; X64-NEXT: xorq %rcx, %rax ; X64-NEXT: orq %r11, %rax -; X64-NEXT: orq %rcx, %rax +; X64-NEXT: orq %r14, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax -; X64-NEXT: movq %rdi, 24(%rax) +; X64-NEXT: movq %rdx, 24(%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload ; X64-NEXT: movq %rcx, (%rax) ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Reload @@ -579,59 +585,60 @@ ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %ecx, %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb %bl -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebp ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax -; 
X86-NEXT: movl %ebp, %ecx -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %esi, %eax +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %ebp, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload @@ -640,65 +647,65 @@ ; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %edi, %ebx ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: movl %eax, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: addl (%esp), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: setb (%esp) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movzbl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill +; X86-NEXT: addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload @@ -706,28 +713,29 @@ ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movzbl %bl, %eax @@ -741,23 +749,23 @@ ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %bl ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload @@ -766,57 +774,57 @@ ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl %edi, %ebx ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull 
%edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl %ebp, %edi +; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %edi, %ebp +; X86-NEXT: addl %esi, %ebp ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %esi +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: adcl $0, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %esi ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) @@ -826,15 +834,15 @@ ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl (%esp), %ecx ## 4-byte Reload ; X86-NEXT: addl %ebp, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: adcl %esi, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx ; X86-NEXT: adcl $0, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, (%esp) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload @@ -858,111 +866,111 @@ ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx -; X86-NEXT: setb %bl +; X86-NEXT: adcl %esi, %ebx +; X86-NEXT: setb %cl ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi -; X86-NEXT: addl %ecx, %eax 
+; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %esi, %ebp +; X86-NEXT: adcl $0, %ebx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi -; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %ebp, %ecx +; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebp ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: adcl $0, %ebp -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: mull %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl %esi, %eax +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl %esi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl %ebp, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax @@ -978,72 +986,71 @@ ; X86-NEXT: adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: movl %edi, %edx ; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: adcl $0, %ecx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: adcl $0, %eax ; X86-NEXT: adcl $0, %esi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb %bl ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %esi +; X86-NEXT: mull %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edi, %esi +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb %bl ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx +; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movzbl %bl, %eax ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload @@ -1051,72 +1058,71 @@ ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, (%esp) ## 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %ebx, %edi +; X86-NEXT: adcl %ebx, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %edi, %esi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: 
movl %eax, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: adcl %eax, %edi ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl (%esp), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl (%esp), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %ecx, %ebp -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp -; X86-NEXT: adcl %edi, %ecx +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb (%esp) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ecx, %eax -; X86-NEXT: movzbl (%esp), %ecx ## 1-byte Folded Reload -; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: adcl %ebx, %ebp -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload -; X86-NEXT: adcl %esi, %eax ; X86-NEXT: movl %eax, %esi +; X86-NEXT: addl %ecx, %esi +; X86-NEXT: movzbl (%esp), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload +; X86-NEXT: adcl %eax, %esi ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill @@ -1135,57 +1141,56 @@ ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: mull %esi +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %esi, %ebx -; X86-NEXT: movl %ebx, %esi 
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %ebp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: adcl $0, %ebx -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ecx, %ebx ; X86-NEXT: setb %al -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: addl %esi, %ebx ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull %ebp ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %esi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl %edi, (%esp) ## 4-byte Spill ; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %ecx, %edi ; X86-NEXT: adcl $0, %edi -; X86-NEXT: addl %ecx, %edx +; X86-NEXT: addl %ebp, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %eax, %edi ; X86-NEXT: setb %al -; X86-NEXT: addl %ebp, %edi +; X86-NEXT: addl %esi, %edi ; X86-NEXT: movzbl %al, %edx -; X86-NEXT: adcl %esi, %edx +; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload @@ -1214,11 +1219,12 @@ ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edx, %ebp ; X86-NEXT: adcl $0, %esi ; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill @@ -1226,25 +1232,25 @@ ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl %edx, %esi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: imull %ecx, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: imull %ebx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: addl %edx, %ebx +; X86-NEXT: mull %ebx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: adcl %ebp, %ebx +; X86-NEXT: adcl %ebp, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload ; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl %ebx, %ebp -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: movl %ebx, %edx ; X86-NEXT: movl (%esp), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %ebx +; X86-NEXT: addl %eax, %edx ; X86-NEXT: adcl $0, %eax -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, (%esp) ## 4-byte Spill ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill @@ -1252,210 +1258,213 @@ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: adcl %ebx, %eax +; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: adcl %edx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl (%esp), %ebx ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: movl (%esp), %edx ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %ebx, %edx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: adcl %esi, %edx +; X86-NEXT: movl %ebx, %esi ; X86-NEXT: adcl %ebp, %esi ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, (%esp) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, %edi ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, %edi +; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %esi, %eax +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %edx, %edi +; X86-NEXT: adcl %edx, %esi ; 
X86-NEXT: setb %bl -; X86-NEXT: addl %eax, %edi +; X86-NEXT: addl %eax, %esi ; X86-NEXT: movzbl %bl, %ebx ; X86-NEXT: adcl %edx, %ebx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: addl %edi, %eax +; X86-NEXT: addl %esi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: adcl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl $0, %edi +; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %edx, %ebx -; X86-NEXT: adcl $0, %ebp -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %edi +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: addl %eax, %ebx -; X86-NEXT: adcl %edx, %ebp -; X86-NEXT: setb %al -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %edx, %edi +; X86-NEXT: setb %cl +; X86-NEXT: addl %eax, %edi +; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; X86-NEXT: adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill -; X86-NEXT: movl %ebp, %edx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edi, %ecx ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: setb %dl -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ebx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movzbl %dl, %ecx -; X86-NEXT: adcl %ebp, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl $0, %eax +; X86-NEXT: addl %esi, %ecx +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, %esi +; X86-NEXT: setb %al +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl $0, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload +; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: adcl $0, %ebx ; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload -; X86-NEXT: movl %ebp, %edi -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl %ecx, %edi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: adcl %edx, %esi -; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload -; X86-NEXT: adcl %ebp, %eax +; X86-NEXT: adcl %eax, %ebx +; X86-NEXT: setb %al +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: addl %ecx, %edi -; X86-NEXT: adcl %esi, %eax +; X86-NEXT: adcl %edx, %ebp +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl %ebp, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: mull %ebp +; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl %ebx, %ecx +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl %esi, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: imull %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: imull %ebp, %esi +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %eax, %ebx -; X86-NEXT: addl %edx, %ebx -; X86-NEXT: addl %esi, %eax -; X86-NEXT: adcl %ebp, %ebx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl %eax, %ecx +; X86-NEXT: addl %esi, %edx +; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %ecx, %eax ; X86-NEXT: adcl %ebx, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 1-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl %eax, %edi +; X86-NEXT: adcl %edx, %esi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload -; X86-NEXT: addl %eax, %esi -; X86-NEXT: adcl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl %edi, %ecx -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload +; X86-NEXT: addl %eax, %ecx ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ecx ## 4-byte Folded Reload +; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx ## 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %edx ; X86-NEXT: xorl %eax, %edi +; X86-NEXT: orl %edx, %edi ; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: xorl %eax, %esi -; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: orl %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload ; X86-NEXT: xorl %eax, %esi -; X86-NEXT: xorl %eax, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: xorl %eax, %ebp -; X86-NEXT: orl %edx, %ebp +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: xorl %eax, %ebx ; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload -; X86-NEXT: orl %ebp, %eax -; X86-NEXT: orl %ecx, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, 28(%eax) +; X86-NEXT: movl %ebp, 28(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll --- a/llvm/test/CodeGen/X86/sse-regcall.ll +++ b/llvm/test/CodeGen/X86/sse-regcall.ll @@ -196,7 +196,7 @@ ; WIN32: # %bb.0: ; WIN32-NEXT: pushl %ebp ; WIN32-NEXT: pushl %ebx -; WIN32-NEXT: subl $16, %esp +; WIN32-NEXT: subl $12, %esp ; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill ; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: movl %edx, %ebx @@ -207,37 +207,36 @@ ; WIN32-NEXT: subl %esi, %ebx ; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: subl %ecx, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: imull %eax, %ecx -; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl %esi, %edx -; WIN32-NEXT: subl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull %ebx, %edx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl %esi, %eax +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: imull %ebx, %eax +; WIN32-NEXT: addl %ecx, %eax ; WIN32-NEXT: movl (%esp), %ebx # 4-byte Reload -; WIN32-NEXT: subl %ebp, %ebx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: subl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: imull %ebx, %ecx -; WIN32-NEXT: addl %edx, %ecx +; WIN32-NEXT: addl %eax, %ecx ; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; WIN32-NEXT: addl (%esp), %ebp # 4-byte Folded Reload -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: imull %edx, %edi +; WIN32-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: imull %ebp, %edi ; WIN32-NEXT: addl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; WIN32-NEXT: addl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: imull %ebp, %eax -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: addl %eax, %edi +; WIN32-NEXT: addl %esi, %edi +; WIN32-NEXT: addl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: imull %eax, %edx +; WIN32-NEXT: addl %edx, %edi ; WIN32-NEXT: addl %ecx, %edi -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; WIN32-NEXT: movl %edi, %eax -; WIN32-NEXT: addl $16, %esp +; WIN32-NEXT: addl $12, %esp ; WIN32-NEXT: popl %ebx ; WIN32-NEXT: popl %ebp ; WIN32-NEXT: retl @@ -271,18 +270,18 @@ ; WIN64-NEXT: # kill: def $r11d killed $r11d killed $r11 ; WIN64-NEXT: subl %r12d, %r11d ; WIN64-NEXT: imull %edx, %r11d +; WIN64-NEXT: addl %r9d, %r11d ; WIN64-NEXT: leal (%r14,%r15), %edx -; WIN64-NEXT: # kill: def $r14d killed $r14d killed $r14 -; WIN64-NEXT: subl %r15d, %r14d -; WIN64-NEXT: imull %esi, %r14d -; WIN64-NEXT: addl %r11d, %r14d +; WIN64-NEXT: movl %r14d, %r9d +; WIN64-NEXT: subl %r15d, %r9d +; WIN64-NEXT: imull %esi, %r9d +; WIN64-NEXT: addl %r11d, %r9d ; WIN64-NEXT: addl %ecx, %eax ; WIN64-NEXT: imull %r8d, %eax ; WIN64-NEXT: imull %ebx, %r10d +; WIN64-NEXT: addl %r10d, %eax ; WIN64-NEXT: imull %edi, %edx -; WIN64-NEXT: addl %r10d, %edx ; WIN64-NEXT: addl %edx, %eax -; WIN64-NEXT: addl %r14d, %eax ; WIN64-NEXT: addl %r9d, %eax ; WIN64-NEXT: popq %rbx ; WIN64-NEXT: retq @@ -312,19 +311,19 @@ ; LINUXOSX-NEXT: leal (%r13,%r14), %r11d ; LINUXOSX-NEXT: movl %r13d, %r12d ; LINUXOSX-NEXT: subl %r14d, %r12d -; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %r14d ; LINUXOSX-NEXT: imull %edx, %r12d -; LINUXOSX-NEXT: movl %r15d, %edx -; LINUXOSX-NEXT: subl %r14d, %edx -; LINUXOSX-NEXT: imull %esi, %edx -; LINUXOSX-NEXT: addl %r12d, %edx +; LINUXOSX-NEXT: movl {{[0-9]+}}(%rsp), %edx +; LINUXOSX-NEXT: addl %r9d, %r12d +; LINUXOSX-NEXT: movl %r15d, %r9d +; LINUXOSX-NEXT: subl %edx, %r9d +; LINUXOSX-NEXT: imull %esi, %r9d +; LINUXOSX-NEXT: addl %r12d, %r9d ; LINUXOSX-NEXT: addl %ecx, %eax ; LINUXOSX-NEXT: imull %r8d, %eax ; LINUXOSX-NEXT: imull %r10d, %r11d -; LINUXOSX-NEXT: addl %r15d, %r14d -; LINUXOSX-NEXT: imull %edi, %r14d -; LINUXOSX-NEXT: addl %r11d, %r14d -; LINUXOSX-NEXT: addl %r14d, %eax +; LINUXOSX-NEXT: addl %r11d, %eax +; LINUXOSX-NEXT: addl %r15d, %edx +; LINUXOSX-NEXT: imull %edi, %edx ; LINUXOSX-NEXT: addl %edx, %eax ; LINUXOSX-NEXT: addl %r9d, %eax ; LINUXOSX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll --- a/llvm/test/CodeGen/X86/stack-clash-large.ll +++ b/llvm/test/CodeGen/X86/stack-clash-large.ll @@ -98,13 +98,13 @@ ; CHECK-X64-NEXT: .cfi_def_cfa_offset 71888 ; CHECK-X64-NEXT: .cfi_offset %rax, -16 ; CHECK-X64-NEXT: movl 71888(%rsp), %eax +; CHECK-X64-NEXT: addl %esi, %edi ; CHECK-X64-NEXT: addl %ecx, %edx +; CHECK-X64-NEXT: addl %edi, %edx +; CHECK-X64-NEXT: addl %r9d, %r8d ; CHECK-X64-NEXT: addl 71896(%rsp), %eax -; CHECK-X64-NEXT: addl %esi, %edx -; CHECK-X64-NEXT: addl %r9d, %eax ; CHECK-X64-NEXT: addl %r8d, %eax ; CHECK-X64-NEXT: addl %edx, %eax -; CHECK-X64-NEXT: addl %edi, %eax ; CHECK-X64-NEXT: movl %eax, 264(%rsp) ; CHECK-X64-NEXT: movl %eax, 28664(%rsp) ; CHECK-X64-NEXT: addq $71872, %rsp # imm = 0x118C0 @@ -141,16 +141,16 @@ ; CHECK-X86-NEXT: .cfi_offset %edx, -12 ; CHECK-X86-NEXT: 
.cfi_offset %esi, -8 ; CHECK-X86-NEXT: movl 72056(%esp), %eax -; CHECK-X86-NEXT: movl 72048(%esp), %ecx -; CHECK-X86-NEXT: movl 72040(%esp), %edx +; CHECK-X86-NEXT: movl 72048(%esp), %edx +; CHECK-X86-NEXT: movl 72040(%esp), %ecx ; CHECK-X86-NEXT: movl 72032(%esp), %esi ; CHECK-X86-NEXT: addl 72036(%esp), %esi -; CHECK-X86-NEXT: addl 72044(%esp), %edx -; CHECK-X86-NEXT: addl 72052(%esp), %ecx +; CHECK-X86-NEXT: addl 72044(%esp), %ecx +; CHECK-X86-NEXT: addl %esi, %ecx +; CHECK-X86-NEXT: addl 72052(%esp), %edx ; CHECK-X86-NEXT: addl 72060(%esp), %eax -; CHECK-X86-NEXT: addl %ecx, %eax ; CHECK-X86-NEXT: addl %edx, %eax -; CHECK-X86-NEXT: addl %esi, %eax +; CHECK-X86-NEXT: addl %ecx, %eax ; CHECK-X86-NEXT: movl %eax, 392(%esp) ; CHECK-X86-NEXT: movl %eax, 28792(%esp) ; CHECK-X86-NEXT: addl $72012, %esp # imm = 0x1194C @@ -184,13 +184,13 @@ ; CHECK-X32-NEXT: .cfi_def_cfa_offset 71888 ; CHECK-X32-NEXT: .cfi_offset %rax, -16 ; CHECK-X32-NEXT: movl 71888(%esp), %eax +; CHECK-X32-NEXT: addl %esi, %edi ; CHECK-X32-NEXT: addl %ecx, %edx +; CHECK-X32-NEXT: addl %edi, %edx +; CHECK-X32-NEXT: addl %r9d, %r8d ; CHECK-X32-NEXT: addl 71896(%esp), %eax -; CHECK-X32-NEXT: addl %esi, %edx -; CHECK-X32-NEXT: addl %r9d, %eax ; CHECK-X32-NEXT: addl %r8d, %eax ; CHECK-X32-NEXT: addl %edx, %eax -; CHECK-X32-NEXT: addl %edi, %eax ; CHECK-X32-NEXT: movl %eax, 264(%esp) ; CHECK-X32-NEXT: movl %eax, 28664(%esp) ; CHECK-X32-NEXT: addl $71872, %esp # imm = 0x118C0 diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll --- a/llvm/test/CodeGen/X86/statepoint-live-in.ll +++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll @@ -442,82 +442,79 @@ ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %edi, %ebp -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %r8d, %r14d -; CHECK-NEXT: movl %r9d, %r15d -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %edx, %r14d +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %r8d, %r12d +; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq 
%rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _bar ## 160-byte Folded Reload ; CHECK-NEXT: Ltmp13: -; CHECK-NEXT: addq %r12, %rbx -; CHECK-NEXT: addq %r13, %rbx -; CHECK-NEXT: addq %r14, %rbx -; CHECK-NEXT: addq %r15, %rbx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload +; CHECK-NEXT: addq %rax, %r14 +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload +; CHECK-NEXT: addq %r14, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r15 +; CHECK-NEXT: addq %r12, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rbp +; CHECK-NEXT: addq %rbx, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r13 +; CHECK-NEXT: addq %rbp, %r13 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: addq %rbp, %rbx -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: addq %r13, %rcx +; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: addq $168, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll --- a/llvm/test/CodeGen/X86/statepoint-regs.ll +++ b/llvm/test/CodeGen/X86/statepoint-regs.ll @@ -554,82 +554,79 @@ ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl %edi, %ebp -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edx, %r12d -; CHECK-NEXT: movl %ecx, %r13d -; CHECK-NEXT: movl %r8d, %r14d -; CHECK-NEXT: movl %r9d, %r15d -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; 
CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %edx, %r14d +; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %r8d, %r12d +; CHECK-NEXT: movl %r9d, %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r13d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ebx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r15d ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill ; CHECK-NEXT: callq _bar ## 160-byte Folded Reload ; CHECK-NEXT: Ltmp14: -; CHECK-NEXT: addq %r12, %rbx -; CHECK-NEXT: addq %r13, %rbx -; CHECK-NEXT: addq %r14, %rbx -; CHECK-NEXT: addq %r15, %rbx +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Folded Reload +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload +; CHECK-NEXT: addq %rax, %r14 +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 ## 8-byte Folded Reload +; CHECK-NEXT: addq %r14, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r12 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r15 +; CHECK-NEXT: addq %r12, %r15 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %r15, %rbx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rbp +; CHECK-NEXT: addq %rbx, %rbp ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r13 ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %r13 +; CHECK-NEXT: addq %rbp, %r13 +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; 
CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx +; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax -; CHECK-NEXT: addq %rax, %rbx -; CHECK-NEXT: addq %rbp, %rbx -; CHECK-NEXT: movq %rbx, %rax +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: addq %r13, %rcx +; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: addq $168, %rsp ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r12 diff --git a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll --- a/llvm/test/CodeGen/X86/swift-return.ll +++ b/llvm/test/CodeGen/X86/swift-return.ll @@ -147,9 +147,9 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%rsp) ; CHECK-NEXT: callq gen3@PLT -; CHECK-NEXT: # kill: def $edx killed $edx def $rdx ; CHECK-NEXT: # kill: def $ecx killed $ecx def $rcx -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: # kill: def $r8d killed $r8d def $r8 +; CHECK-NEXT: addl %edx, %eax ; CHECK-NEXT: addl %r8d, %ecx ; CHECK-NEXT: addl %ecx, %eax ; CHECK-NEXT: popq %rcx @@ -360,7 +360,7 @@ ; CHECK-NEXT: addsd %xmm1, %xmm0 ; CHECK-NEXT: addsd %xmm2, %xmm0 ; CHECK-NEXT: addsd %xmm3, %xmm0 -; CHECK-NEXT: addq %rdx, %rcx +; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: addq %r8, %rcx ; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: popq %rcx diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -93,7 +93,7 @@ ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %edi @@ -106,7 +106,7 @@ ; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl %edi, %esi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -142,12 +142,12 @@ ; X86-NEXT: addl %ebx, %esi ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl %ebp, %eax @@ -176,13 +176,13 @@ ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ecx ; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: adcl (%esp), %edi # 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: addl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: adcl (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: setb (%esp) # 1-byte Folded Spill +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi @@ -201,120 +201,123 @@ ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: adcl %edi, %esi -; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: setb (%esp) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %eax -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload ; X86-NEXT: adcl %esi, %edx ; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl %ecx, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: adcl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ecx, %edi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: setb %bl +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: adcl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %esi, %ebx -; X86-NEXT: setb %cl +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %edi -; X86-NEXT: addl %ebx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movzbl %cl, %eax -; X86-NEXT: adcl %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: addl %esi, %ebp -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %esi -; X86-NEXT: addl %ebp, %eax 
+; X86-NEXT: movl %eax, %ebx +; X86-NEXT: addl %ecx, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: adcl %ebx, %esi +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %esi, %ecx -; X86-NEXT: movzbl %bl, %eax -; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movzbl %bl, %ecx +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill -; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %ebp, %ebx ; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %ebx, %edi +; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl $0, %esi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: mull %ebx -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movl %eax, %edi -; X86-NEXT: adcl %esi, %ebp +; X86-NEXT: adcl %esi, %ecx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi +; X86-NEXT: addl %ecx, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: addl %ecx, %ebp +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl (%esp), %esi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %edx -; X86-NEXT: addl %edx, %edi +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: imull {{[0-9]+}}(%esp), %edi ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: movl %eax, %edx ; 
X86-NEXT: addl %esi, %edx @@ -365,7 +368,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %ebx @@ -377,23 +380,23 @@ ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ebp, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %edi, %ebx +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: addl %edi, %ebp ; X86-NEXT: movzbl %cl, %eax ; X86-NEXT: adcl %eax, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -404,45 +407,45 @@ ; X86-NEXT: addl %esi, %ecx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl %edx, %ebx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %esi -; X86-NEXT: addl %ebp, %esi +; X86-NEXT: addl %ebx, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: adcl %eax, %edi -; X86-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: imull %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %ecx ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: addl %edx, %ebx +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx ; X86-NEXT: addl %ebp, %ebx ; X86-NEXT: addl %esi, %ecx ; X86-NEXT: adcl %edi, %ebx -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: 
addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -453,15 +456,15 @@ ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %eax -; X86-NEXT: imull {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: imull %edx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edx ; X86-NEXT: movl %eax, %edi -; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %edx, %ebp +; X86-NEXT: imull {{[0-9]+}}(%esp), %esi ; X86-NEXT: addl %ebp, %esi ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: adcl %ebx, %esi @@ -498,9 +501,9 @@ ; X86-NEXT: movl %esi, 8(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, 12(%edx) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: movl %esi, 16(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 16(%edx) +; X86-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-NEXT: movl %esi, 20(%edx) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, 24(%edx) @@ -527,14 +530,15 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %r8, %r11 -; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: movq %rcx, %r8 +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %r10, %rbp ; X64-NEXT: movq %rdx, %r14 @@ -549,13 +553,13 @@ ; X64-NEXT: adcq %r14, %r12 ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %r10d -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r15 ; X64-NEXT: movq %rax, %r13 ; X64-NEXT: addq %r12, %r13 ; X64-NEXT: adcq %r10, %r15 -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %rbp ; X64-NEXT: movq %rdx, %r12 ; X64-NEXT: movq %rax, %r14 @@ -565,7 +569,7 @@ ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %r12, %r10 ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %r8, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rax, %r12 ; X64-NEXT: addq %r10, %r12 @@ -579,7 +583,7 @@ ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq %rax, %r15 -; X64-NEXT: movq %r8, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %rbp @@ -590,12 +594,12 @@ ; X64-NEXT: mulq %r10 ; X64-NEXT: addq %rbp, %rax ; X64-NEXT: adcq %r13, %rdx -; X64-NEXT: imulq %r10, %r8 -; X64-NEXT: addq %rdx, %r8 +; X64-NEXT: imulq %r10, %rcx +; X64-NEXT: addq %rdx, %rcx ; X64-NEXT: addq %r14, %r15 ; X64-NEXT: adcq %r12, %rax -; X64-NEXT: adcq %r11, %r8 -; X64-NEXT: imulq %r9, %rcx +; X64-NEXT: adcq %r11, %rcx +; X64-NEXT: imulq %r9, %r8 ; 
X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx ; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -185,8 +185,8 @@ ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi ; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %ecx, %esi ; X86-NEXT: addl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -278,30 +278,28 @@ ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %dl ; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %bl -; X86-NEXT: andb %dl, %bl -; X86-NEXT: mull %ebp +; X86-NEXT: setne %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %edi -; X86-NEXT: seto %bh +; X86-NEXT: seto %bl ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: seto %cl -; X86-NEXT: orb %bh, %cl -; X86-NEXT: leal (%edi,%eax), %esi -; X86-NEXT: movl %edx, %eax ; X86-NEXT: mull %ebp -; X86-NEXT: addl %esi, %edx -; X86-NEXT: setb %ch -; X86-NEXT: orb %cl, %ch +; X86-NEXT: seto %ch ; X86-NEXT: orb %bl, %ch +; X86-NEXT: orb %cl, %ch +; X86-NEXT: leal (%edi,%eax), %esi +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: addl %esi, %edx +; X86-NEXT: setb %cl +; X86-NEXT: orb %ch, %cl ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovnel %ecx, %eax ; X86-NEXT: cmovnel %ecx, %edx diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -19,13 +19,13 @@ ; X64-NEXT: mulq %rdi ; X64-NEXT: seto %r11b ; X64-NEXT: orb %r10b, %r11b +; X64-NEXT: orb %r9b, %r11b ; X64-NEXT: leaq (%rsi,%rax), %rcx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: setb %cl ; X64-NEXT: orb %r11b, %cl -; X64-NEXT: orb %r9b, %cl ; X64-NEXT: retq ; ; X86-LABEL: muloti_test: @@ -47,24 +47,24 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %ebx +; X86-NEXT: mull %ebp ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: seto (%esp) # 1-byte Folded Spill ; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %esi ; X86-NEXT: leal (%ecx,%eax), %ecx -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: seto %bh ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %esi +; X86-NEXT: mull %ebp +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ecx, %ebp ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 
1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: mull %esi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -72,18 +72,18 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %edi ; X86-NEXT: leal (%ecx,%eax), %ecx -; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: seto %bl ; X86-NEXT: movl %edi, %eax -; X86-NEXT: mull %ebx -; X86-NEXT: movl %eax, %ebx +; X86-NEXT: mull %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %edi ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: addl %esi, %ebx +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebp, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: mull %ecx +; X86-NEXT: mull %esi +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -106,7 +106,7 @@ ; X86-NEXT: addl %esi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl %ebx, %eax +; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl %edi, %edx ; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: testl %ebp, %ebp @@ -115,23 +115,21 @@ ; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %ch ; X86-NEXT: andb %cl, %ch -; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: orb %ch, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: orb (%esp), %bh # 1-byte Folded Reload +; X86-NEXT: orb %ch, %bh +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload +; X86-NEXT: movb %bh, (%esp) # 1-byte Spill ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: setne %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: testl %edi, %edi -; X86-NEXT: setne %bh -; X86-NEXT: andb %cl, %bh -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: setne %ch +; X86-NEXT: andb %cl, %ch +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload +; X86-NEXT: orb %ch, %bl +; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: setne %bl +; X86-NEXT: setne %bh ; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload @@ -141,13 +139,12 @@ ; X86-NEXT: movl %eax, 8(%ecx) ; X86-NEXT: movl %edx, 12(%ecx) ; X86-NEXT: setne %al -; X86-NEXT: andb %bl, %al +; X86-NEXT: andb %bh, %al +; X86-NEXT: orb %bl, %al +; X86-NEXT: orb (%esp), %al # 1-byte Folded Reload ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; 
X86-NEXT: orb %al, %bh -; X86-NEXT: andb $1, %bh -; X86-NEXT: movb %bh, 16(%ecx) +; X86-NEXT: andb $1, %al +; X86-NEXT: movb %al, 16(%ecx) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: addl $24, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll @@ -16,30 +16,28 @@ ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: testl %esi, %esi ; X86-NEXT: setne %dl ; X86-NEXT: testl %eax, %eax -; X86-NEXT: setne %bl -; X86-NEXT: andb %dl, %bl -; X86-NEXT: mull %ebp +; X86-NEXT: setne %cl +; X86-NEXT: andb %dl, %cl +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %edi -; X86-NEXT: seto %bh +; X86-NEXT: seto %bl ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull %ecx -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: mull %ebp ; X86-NEXT: seto %ch -; X86-NEXT: orb %bh, %ch +; X86-NEXT: orb %bl, %ch +; X86-NEXT: orb %cl, %ch ; X86-NEXT: leal (%edi,%eax), %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: mull %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %esi, %edx ; X86-NEXT: setb %cl ; X86-NEXT: orb %ch, %cl -; X86-NEXT: orb %bl, %cl ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-nonzero.ll @@ -295,21 +295,19 @@ define i1 @t64_3_2(i64 %X) nounwind { ; X86-LABEL: t64_3_2: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %edx ; X86-NEXT: imull $-1431655766, %ecx, %ecx # imm = 0xAAAAAAAA -; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %esi # imm = 0xAAAAAAAB -; X86-NEXT: addl %ecx, %esi -; X86-NEXT: addl %edx, %esi +; X86-NEXT: addl %edx, %ecx +; X86-NEXT: imull $-1431655765, {{[0-9]+}}(%esp), %edx # imm = 0xAAAAAAAB +; X86-NEXT: addl %ecx, %edx ; X86-NEXT: addl $-1431655766, %eax # imm = 0xAAAAAAAA -; X86-NEXT: adcl $-1431655766, %esi # imm = 0xAAAAAAAA +; X86-NEXT: adcl $-1431655766, %edx # imm = 0xAAAAAAAA ; X86-NEXT: cmpl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: sbbl $1431655765, %esi # imm = 0x55555555 +; X86-NEXT: sbbl $1431655765, %edx # imm = 0x55555555 ; X86-NEXT: setb %al -; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: t64_3_2: diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -235,9 +235,9 @@ ; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X32-AVX2-NEXT: vandps %ymm0, 
%ymm1, %ymm0 ; X32-AVX2-NEXT: retl ; @@ -247,9 +247,9 @@ ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq entry: @@ -271,8 +271,8 @@ ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X32-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X32-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X32-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X32-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X32-NEXT: retl ; @@ -284,8 +284,8 @@ ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -295,9 +295,9 @@ ; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 @@ -310,9 +310,9 @@ ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 @@ -339,8 +339,8 @@ ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X32-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 -; X32-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X32-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X32-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X32-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -354,8 +354,8 @@ ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 +; 
X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -367,15 +367,15 @@ ; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X32-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X32-AVX2-NEXT: retl ; @@ -385,15 +385,15 @@ ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq entry: @@ -481,9 +481,9 @@ ; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 +; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X32-AVX2-NEXT: retl ; @@ -493,9 +493,9 @@ ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq entry: @@ -517,8 +517,8 @@ ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X32-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X32-NEXT: vorps %ymm2, %ymm1, %ymm1 ; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X32-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X32-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X32-NEXT: retl ; @@ -530,8 +530,8 @@ ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vorps %ymm2, %ymm1, %ymm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; @@ -541,9 +541,9 @@ ; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 @@ -556,9 +556,9 @@ ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 @@ -585,8 +585,8 @@ ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X32-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 -; X32-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X32-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X32-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X32-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -600,8 +600,8 @@ ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; X64-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X64-NEXT: 
vorps %ymm0, %ymm1, %ymm0 @@ -613,15 +613,15 @@ ; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 +; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X32-AVX2-NEXT: retl ; @@ -631,15 +631,15 @@ ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq entry: @@ -789,11 +789,11 @@ ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X32-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X32-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X32-NEXT: vandps %ymm0, %ymm3, %ymm0 +; X32-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X32-NEXT: vorps %ymm0, %ymm1, %ymm0 -; X32-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: five_or_and: @@ -804,11 +804,11 @@ ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; 
X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm3, %ymm0 +; X64-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 -; X64-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X64-NEXT: retq ; ; X32-AVX2-LABEL: five_or_and: @@ -820,13 +820,13 @@ ; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0 -; X32-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0 +; X32-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 -; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X32-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: five_or_and: @@ -838,13 +838,13 @@ ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0 +; X64-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: retq entry: %cmp = fcmp oge <8 x float> %x, @@ -936,10 +936,10 @@ ; X32-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm4 +; X32-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 ; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X32-NEXT: vandps %ymm0, %ymm4, %ymm0 -; X32-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; X32-NEXT: vandps %ymm0, %ymm3, %ymm0 ; X32-NEXT: vxorps %ymm0, %ymm2, %ymm0 ; X32-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X32-NEXT: retl @@ -951,10 +951,10 @@ ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; 
X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; X64-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vandps %ymm0, %ymm4, %ymm0 -; X64-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; X64-NEXT: vandps %ymm0, %ymm3, %ymm0 ; X64-NEXT: vxorps %ymm0, %ymm2, %ymm0 ; X64-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq @@ -967,12 +967,12 @@ ; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 ; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X32-AVX2-NEXT: vcmpneqps %ymm5, %ymm0, %ymm0 -; X32-AVX2-NEXT: vandps %ymm0, %ymm4, %ymm0 -; X32-AVX2-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; X32-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0 +; X32-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0 ; X32-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0 ; X32-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X32-AVX2-NEXT: retl @@ -985,12 +985,12 @@ ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm5, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm4, %ymm0 -; X64-AVX2-NEXT: vxorps %ymm0, %ymm3, %ymm0 +; X64-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0 ; X64-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: retq @@ -1015,12 +1015,12 @@ ; X32-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm4 -; X32-NEXT: vandps %ymm4, %ymm3, %ymm3 ; X32-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X32-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X32-NEXT: vxorps 
%ymm1, %ymm3, %ymm1 ; X32-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 -; X32-NEXT: vxorps %ymm1, %ymm2, %ymm1 ; X32-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X32-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X32-NEXT: retl @@ -1032,12 +1032,12 @@ ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 -; X64-NEXT: vandps %ymm4, %ymm3, %ymm3 ; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-NEXT: vxorps %ymm1, %ymm3, %ymm1 ; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; X64-NEXT: vxorps %ymm1, %ymm2, %ymm1 ; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq @@ -1050,14 +1050,14 @@ ; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 ; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4 -; X32-AVX2-NEXT: vandps %ymm4, %ymm3, %ymm3 ; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X32-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1 ; X32-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X32-AVX2-NEXT: vxorps %ymm1, %ymm2, %ymm1 ; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] ; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -1071,14 +1071,14 @@ ; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4 -; X64-AVX2-NEXT: vandps %ymm4, %ymm3, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-AVX2-NEXT: vxorps %ymm1, 
%ymm3, %ymm1 ; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vxorps %ymm1, %ymm2, %ymm1 ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -3304,108 +3304,111 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movq %r8, %r14 -; SSE2-NEXT: movq %rcx, %r13 +; SSE2-NEXT: movq %r8, %r15 ; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rsi, %r11 ; SSE2-NEXT: movq %rdi, %r10 ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE2-NEXT: movq %r11, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movq %r9, %r15 -; SSE2-NEXT: imulq %rcx, %r15 -; SSE2-NEXT: movq %r14, %rax -; SSE2-NEXT: mulq %rcx -; SSE2-NEXT: movq %rax, %rdi -; SSE2-NEXT: addq %rax, %r15 -; SSE2-NEXT: addq %rdx, %r15 +; SSE2-NEXT: movq %r11, %rdi +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: movq %r9, %rbx +; SSE2-NEXT: imulq %rdi, %rbx +; SSE2-NEXT: movq %r15, %rax +; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rdx, %rdi +; SSE2-NEXT: movq %rax, %r12 +; SSE2-NEXT: addq %rax, %rdi +; SSE2-NEXT: addq %rbx, %rdi ; SSE2-NEXT: movq %r9, %rax ; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: imulq %r11, %rcx +; SSE2-NEXT: movq %rax, %r13 +; SSE2-NEXT: imulq %r11, %r13 ; SSE2-NEXT: mulq %r10 -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: addq %rax, %rcx -; SSE2-NEXT: addq %rdx, %rcx -; SSE2-NEXT: addq %rdi, %rbx -; SSE2-NEXT: adcq %r15, %rcx +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: addq %r13, %rbx +; SSE2-NEXT: addq %rax, %rbx +; SSE2-NEXT: addq %r12, %r14 +; SSE2-NEXT: adcq %rdi, %rbx ; SSE2-NEXT: movq %r10, %rax -; SSE2-NEXT: mulq %r14 -; SSE2-NEXT: movq %rdx, %r15 +; SSE2-NEXT: mulq %r15 +; SSE2-NEXT: movq %rdx, %r12 ; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq %r11, %rax -; SSE2-NEXT: mulq %r14 -; SSE2-NEXT: movq %rdx, %r14 -; SSE2-NEXT: movq %rax, %r12 -; SSE2-NEXT: addq %r15, %r12 -; SSE2-NEXT: adcq $0, %r14 +; SSE2-NEXT: mulq %r15 +; SSE2-NEXT: movq %rdx, %r15 +; SSE2-NEXT: movq %rax, %r13 +; SSE2-NEXT: addq %r12, %r13 +; SSE2-NEXT: adcq $0, %r15 ; SSE2-NEXT: movq %r10, %rax ; SSE2-NEXT: mulq %r9 -; SSE2-NEXT: movq %rdx, %r15 +; SSE2-NEXT: movq %rdx, %r12 ; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: addq %r12, %r10 -; SSE2-NEXT: adcq %r14, %r15 +; SSE2-NEXT: addq %r13, %r10 +; SSE2-NEXT: adcq %r15, %r12 ; SSE2-NEXT: setb %al -; SSE2-NEXT: movzbl %al, %r14d +; SSE2-NEXT: movzbl %al, %r15d ; SSE2-NEXT: movq %r11, %rax ; SSE2-NEXT: mulq %r9 -; SSE2-NEXT: addq %r15, %rax -; SSE2-NEXT: adcq %r14, %rdx -; SSE2-NEXT: addq %rbx, %rax -; SSE2-NEXT: adcq %rcx, %rdx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSE2-NEXT: movq %r10, 8(%r15) +; SSE2-NEXT: addq %r12, %rax +; SSE2-NEXT: adcq %r15, %rdx +; SSE2-NEXT: addq %r14, %rax +; SSE2-NEXT: adcq %rbx, %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE2-NEXT: movq %r10, 8(%r12) ; SSE2-NEXT: sarq $63, %r10 ; SSE2-NEXT: xorq %r10, %rdx ; SSE2-NEXT: xorq %rax, %r10 -; SSE2-NEXT: xorl %ecx, %ecx +; 
SSE2-NEXT: xorl %r15d, %r15d ; SSE2-NEXT: orq %rdx, %r10 -; SSE2-NEXT: setne %cl -; SSE2-NEXT: movq %r13, %r9 +; SSE2-NEXT: setne %r15b +; SSE2-NEXT: movq %rcx, %r9 ; SSE2-NEXT: sarq $63, %r9 ; SSE2-NEXT: movq %rbp, %r11 ; SSE2-NEXT: imulq %r9, %r11 ; SSE2-NEXT: movq %rsi, %rax ; SSE2-NEXT: mulq %r9 -; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: addq %rax, %r11 -; SSE2-NEXT: addq %rdx, %r11 +; SSE2-NEXT: movq %rdx, %r9 +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: addq %rax, %r9 +; SSE2-NEXT: addq %r11, %r9 ; SSE2-NEXT: movq %rbp, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movq %rax, %r14 -; SSE2-NEXT: imulq %r13, %r14 +; SSE2-NEXT: imulq %rcx, %r14 ; SSE2-NEXT: mulq %r8 -; SSE2-NEXT: movq %rax, %r10 -; SSE2-NEXT: addq %rax, %r14 -; SSE2-NEXT: addq %rdx, %r14 -; SSE2-NEXT: addq %r9, %r10 -; SSE2-NEXT: adcq %r11, %r14 +; SSE2-NEXT: movq %rax, %r11 +; SSE2-NEXT: movq %rdx, %rbx +; SSE2-NEXT: addq %r14, %rbx +; SSE2-NEXT: addq %rax, %rbx +; SSE2-NEXT: addq %r10, %r11 +; SSE2-NEXT: adcq %r9, %rbx ; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rax, %r11 -; SSE2-NEXT: movq %r13, %rax +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rsi ; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: movq %rax, %rbx -; SSE2-NEXT: addq %r9, %rbx +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: addq %r9, %r14 ; SSE2-NEXT: adcq $0, %rsi ; SSE2-NEXT: movq %r8, %rax ; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: movq %rdx, %r8 ; SSE2-NEXT: movq %rax, %r9 -; SSE2-NEXT: addq %rbx, %r9 +; SSE2-NEXT: addq %r14, %r9 ; SSE2-NEXT: adcq %rsi, %r8 ; SSE2-NEXT: setb %al ; SSE2-NEXT: movzbl %al, %esi -; SSE2-NEXT: movq %r13, %rax +; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %rbp ; SSE2-NEXT: addq %r8, %rax ; SSE2-NEXT: adcq %rsi, %rdx -; SSE2-NEXT: addq %r10, %rax -; SSE2-NEXT: adcq %r14, %rdx -; SSE2-NEXT: movq %r9, 24(%r15) +; SSE2-NEXT: addq %r11, %rax +; SSE2-NEXT: adcq %rbx, %rdx +; SSE2-NEXT: movq %r9, 24(%r12) ; SSE2-NEXT: sarq $63, %r9 ; SSE2-NEXT: xorq %r9, %rdx ; SSE2-NEXT: xorq %rax, %r9 @@ -3414,11 +3417,11 @@ ; SSE2-NEXT: setne %al ; SSE2-NEXT: negl %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: negl %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: negl %r15d +; SSE2-NEXT: movd %r15d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %r11, 16(%r15) -; SSE2-NEXT: movq %rdi, (%r15) +; SSE2-NEXT: movq %r10, 16(%r12) +; SSE2-NEXT: movq %rdi, (%r12) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3435,108 +3438,111 @@ ; SSSE3-NEXT: pushq %r13 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movq %r8, %r14 -; SSSE3-NEXT: movq %rcx, %r13 +; SSSE3-NEXT: movq %r8, %r15 ; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rsi, %r11 ; SSSE3-NEXT: movq %rdi, %r10 ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSSE3-NEXT: movq %r11, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movq %r9, %r15 -; SSSE3-NEXT: imulq %rcx, %r15 -; SSSE3-NEXT: movq %r14, %rax -; SSSE3-NEXT: mulq %rcx -; SSSE3-NEXT: movq %rax, %rdi -; SSSE3-NEXT: addq %rax, %r15 -; SSSE3-NEXT: addq %rdx, %r15 +; SSSE3-NEXT: movq %r11, %rdi +; SSSE3-NEXT: sarq $63, %rdi +; SSSE3-NEXT: movq %r9, %rbx +; SSSE3-NEXT: imulq %rdi, %rbx +; SSSE3-NEXT: movq %r15, %rax +; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rdx, %rdi +; SSSE3-NEXT: movq %rax, %r12 +; SSSE3-NEXT: addq %rax, %rdi +; SSSE3-NEXT: addq %rbx, %rdi ; SSSE3-NEXT: movq %r9, %rax ; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: 
movq %rax, %rcx -; SSSE3-NEXT: imulq %r11, %rcx +; SSSE3-NEXT: movq %rax, %r13 +; SSSE3-NEXT: imulq %r11, %r13 ; SSSE3-NEXT: mulq %r10 -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: addq %rax, %rcx -; SSSE3-NEXT: addq %rdx, %rcx -; SSSE3-NEXT: addq %rdi, %rbx -; SSSE3-NEXT: adcq %r15, %rcx +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: addq %r13, %rbx +; SSSE3-NEXT: addq %rax, %rbx +; SSSE3-NEXT: addq %r12, %r14 +; SSSE3-NEXT: adcq %rdi, %rbx ; SSSE3-NEXT: movq %r10, %rax -; SSSE3-NEXT: mulq %r14 -; SSSE3-NEXT: movq %rdx, %r15 +; SSSE3-NEXT: mulq %r15 +; SSSE3-NEXT: movq %rdx, %r12 ; SSSE3-NEXT: movq %rax, %rdi ; SSSE3-NEXT: movq %r11, %rax -; SSSE3-NEXT: mulq %r14 -; SSSE3-NEXT: movq %rdx, %r14 -; SSSE3-NEXT: movq %rax, %r12 -; SSSE3-NEXT: addq %r15, %r12 -; SSSE3-NEXT: adcq $0, %r14 +; SSSE3-NEXT: mulq %r15 +; SSSE3-NEXT: movq %rdx, %r15 +; SSSE3-NEXT: movq %rax, %r13 +; SSSE3-NEXT: addq %r12, %r13 +; SSSE3-NEXT: adcq $0, %r15 ; SSSE3-NEXT: movq %r10, %rax ; SSSE3-NEXT: mulq %r9 -; SSSE3-NEXT: movq %rdx, %r15 +; SSSE3-NEXT: movq %rdx, %r12 ; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: addq %r12, %r10 -; SSSE3-NEXT: adcq %r14, %r15 +; SSSE3-NEXT: addq %r13, %r10 +; SSSE3-NEXT: adcq %r15, %r12 ; SSSE3-NEXT: setb %al -; SSSE3-NEXT: movzbl %al, %r14d +; SSSE3-NEXT: movzbl %al, %r15d ; SSSE3-NEXT: movq %r11, %rax ; SSSE3-NEXT: mulq %r9 -; SSSE3-NEXT: addq %r15, %rax -; SSSE3-NEXT: adcq %r14, %rdx -; SSSE3-NEXT: addq %rbx, %rax -; SSSE3-NEXT: adcq %rcx, %rdx -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSSE3-NEXT: movq %r10, 8(%r15) +; SSSE3-NEXT: addq %r12, %rax +; SSSE3-NEXT: adcq %r15, %rdx +; SSSE3-NEXT: addq %r14, %rax +; SSSE3-NEXT: adcq %rbx, %rdx +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSSE3-NEXT: movq %r10, 8(%r12) ; SSSE3-NEXT: sarq $63, %r10 ; SSSE3-NEXT: xorq %r10, %rdx ; SSSE3-NEXT: xorq %rax, %r10 -; SSSE3-NEXT: xorl %ecx, %ecx +; SSSE3-NEXT: xorl %r15d, %r15d ; SSSE3-NEXT: orq %rdx, %r10 -; SSSE3-NEXT: setne %cl -; SSSE3-NEXT: movq %r13, %r9 +; SSSE3-NEXT: setne %r15b +; SSSE3-NEXT: movq %rcx, %r9 ; SSSE3-NEXT: sarq $63, %r9 ; SSSE3-NEXT: movq %rbp, %r11 ; SSSE3-NEXT: imulq %r9, %r11 ; SSSE3-NEXT: movq %rsi, %rax ; SSSE3-NEXT: mulq %r9 -; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: addq %rax, %r11 -; SSSE3-NEXT: addq %rdx, %r11 +; SSSE3-NEXT: movq %rdx, %r9 +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: addq %rax, %r9 +; SSSE3-NEXT: addq %r11, %r9 ; SSSE3-NEXT: movq %rbp, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movq %rax, %r14 -; SSSE3-NEXT: imulq %r13, %r14 +; SSSE3-NEXT: imulq %rcx, %r14 ; SSSE3-NEXT: mulq %r8 -; SSSE3-NEXT: movq %rax, %r10 -; SSSE3-NEXT: addq %rax, %r14 -; SSSE3-NEXT: addq %rdx, %r14 -; SSSE3-NEXT: addq %r9, %r10 -; SSSE3-NEXT: adcq %r11, %r14 +; SSSE3-NEXT: movq %rax, %r11 +; SSSE3-NEXT: movq %rdx, %rbx +; SSSE3-NEXT: addq %r14, %rbx +; SSSE3-NEXT: addq %rax, %rbx +; SSSE3-NEXT: addq %r10, %r11 +; SSSE3-NEXT: adcq %r9, %rbx ; SSSE3-NEXT: movq %r8, %rax ; SSSE3-NEXT: mulq %rsi ; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rax, %r11 -; SSSE3-NEXT: movq %r13, %rax +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %rsi ; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: movq %rax, %rbx -; SSSE3-NEXT: addq %r9, %rbx +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: addq %r9, %r14 ; SSSE3-NEXT: adcq $0, %rsi ; SSSE3-NEXT: movq %r8, %rax ; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: movq %rdx, %r8 ; SSSE3-NEXT: movq %rax, %r9 -; SSSE3-NEXT: addq %rbx, %r9 +; SSSE3-NEXT: addq %r14, %r9 ; SSSE3-NEXT: adcq %rsi, 
%r8 ; SSSE3-NEXT: setb %al ; SSSE3-NEXT: movzbl %al, %esi -; SSSE3-NEXT: movq %r13, %rax +; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %rbp ; SSSE3-NEXT: addq %r8, %rax ; SSSE3-NEXT: adcq %rsi, %rdx -; SSSE3-NEXT: addq %r10, %rax -; SSSE3-NEXT: adcq %r14, %rdx -; SSSE3-NEXT: movq %r9, 24(%r15) +; SSSE3-NEXT: addq %r11, %rax +; SSSE3-NEXT: adcq %rbx, %rdx +; SSSE3-NEXT: movq %r9, 24(%r12) ; SSSE3-NEXT: sarq $63, %r9 ; SSSE3-NEXT: xorq %r9, %rdx ; SSSE3-NEXT: xorq %rax, %r9 @@ -3545,11 +3551,11 @@ ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: negl %eax ; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: negl %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: negl %r15d +; SSSE3-NEXT: movd %r15d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %r11, 16(%r15) -; SSSE3-NEXT: movq %rdi, (%r15) +; SSSE3-NEXT: movq %r10, 16(%r12) +; SSSE3-NEXT: movq %rdi, (%r12) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3566,108 +3572,111 @@ ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: movq %r8, %r14 -; SSE41-NEXT: movq %rcx, %r13 +; SSE41-NEXT: movq %r8, %r15 ; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rsi, %r11 ; SSE41-NEXT: movq %rdi, %r10 ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE41-NEXT: movq %r11, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %r9, %r15 -; SSE41-NEXT: imulq %rcx, %r15 -; SSE41-NEXT: movq %r14, %rax -; SSE41-NEXT: mulq %rcx -; SSE41-NEXT: movq %rax, %rdi -; SSE41-NEXT: addq %rax, %r15 -; SSE41-NEXT: addq %rdx, %r15 +; SSE41-NEXT: movq %r11, %rdi +; SSE41-NEXT: sarq $63, %rdi +; SSE41-NEXT: movq %r9, %rbx +; SSE41-NEXT: imulq %rdi, %rbx +; SSE41-NEXT: movq %r15, %rax +; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rdx, %rdi +; SSE41-NEXT: movq %rax, %r12 +; SSE41-NEXT: addq %rax, %rdi +; SSE41-NEXT: addq %rbx, %rdi ; SSE41-NEXT: movq %r9, %rax ; SSE41-NEXT: sarq $63, %rax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: imulq %r11, %rcx +; SSE41-NEXT: movq %rax, %r13 +; SSE41-NEXT: imulq %r11, %r13 ; SSE41-NEXT: mulq %r10 -; SSE41-NEXT: movq %rax, %rbx -; SSE41-NEXT: addq %rax, %rcx -; SSE41-NEXT: addq %rdx, %rcx -; SSE41-NEXT: addq %rdi, %rbx -; SSE41-NEXT: adcq %r15, %rcx +; SSE41-NEXT: movq %rax, %r14 +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: addq %r13, %rbx +; SSE41-NEXT: addq %rax, %rbx +; SSE41-NEXT: addq %r12, %r14 +; SSE41-NEXT: adcq %rdi, %rbx ; SSE41-NEXT: movq %r10, %rax -; SSE41-NEXT: mulq %r14 -; SSE41-NEXT: movq %rdx, %r15 +; SSE41-NEXT: mulq %r15 +; SSE41-NEXT: movq %rdx, %r12 ; SSE41-NEXT: movq %rax, %rdi ; SSE41-NEXT: movq %r11, %rax -; SSE41-NEXT: mulq %r14 -; SSE41-NEXT: movq %rdx, %r14 -; SSE41-NEXT: movq %rax, %r12 -; SSE41-NEXT: addq %r15, %r12 -; SSE41-NEXT: adcq $0, %r14 +; SSE41-NEXT: mulq %r15 +; SSE41-NEXT: movq %rdx, %r15 +; SSE41-NEXT: movq %rax, %r13 +; SSE41-NEXT: addq %r12, %r13 +; SSE41-NEXT: adcq $0, %r15 ; SSE41-NEXT: movq %r10, %rax ; SSE41-NEXT: mulq %r9 -; SSE41-NEXT: movq %rdx, %r15 +; SSE41-NEXT: movq %rdx, %r12 ; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: addq %r12, %r10 -; SSE41-NEXT: adcq %r14, %r15 +; SSE41-NEXT: addq %r13, %r10 +; SSE41-NEXT: adcq %r15, %r12 ; SSE41-NEXT: setb %al -; SSE41-NEXT: movzbl %al, %r14d +; SSE41-NEXT: movzbl %al, %r15d ; SSE41-NEXT: movq %r11, %rax ; SSE41-NEXT: mulq %r9 -; SSE41-NEXT: addq %r15, %rax -; SSE41-NEXT: adcq %r14, %rdx -; SSE41-NEXT: addq %rbx, %rax -; SSE41-NEXT: adcq %rcx, %rdx -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; 
SSE41-NEXT: movq %r10, 8(%r15) +; SSE41-NEXT: addq %r12, %rax +; SSE41-NEXT: adcq %r15, %rdx +; SSE41-NEXT: addq %r14, %rax +; SSE41-NEXT: adcq %rbx, %rdx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE41-NEXT: movq %r10, 8(%r12) ; SSE41-NEXT: sarq $63, %r10 ; SSE41-NEXT: xorq %r10, %rdx ; SSE41-NEXT: xorq %rax, %r10 -; SSE41-NEXT: xorl %ecx, %ecx +; SSE41-NEXT: xorl %r15d, %r15d ; SSE41-NEXT: orq %rdx, %r10 -; SSE41-NEXT: setne %cl -; SSE41-NEXT: movq %r13, %r9 +; SSE41-NEXT: setne %r15b +; SSE41-NEXT: movq %rcx, %r9 ; SSE41-NEXT: sarq $63, %r9 ; SSE41-NEXT: movq %rbp, %r11 ; SSE41-NEXT: imulq %r9, %r11 ; SSE41-NEXT: movq %rsi, %rax ; SSE41-NEXT: mulq %r9 -; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: addq %rax, %r11 -; SSE41-NEXT: addq %rdx, %r11 +; SSE41-NEXT: movq %rdx, %r9 +; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: addq %rax, %r9 +; SSE41-NEXT: addq %r11, %r9 ; SSE41-NEXT: movq %rbp, %rax ; SSE41-NEXT: sarq $63, %rax ; SSE41-NEXT: movq %rax, %r14 -; SSE41-NEXT: imulq %r13, %r14 +; SSE41-NEXT: imulq %rcx, %r14 ; SSE41-NEXT: mulq %r8 -; SSE41-NEXT: movq %rax, %r10 -; SSE41-NEXT: addq %rax, %r14 -; SSE41-NEXT: addq %rdx, %r14 -; SSE41-NEXT: addq %r9, %r10 -; SSE41-NEXT: adcq %r11, %r14 +; SSE41-NEXT: movq %rax, %r11 +; SSE41-NEXT: movq %rdx, %rbx +; SSE41-NEXT: addq %r14, %rbx +; SSE41-NEXT: addq %rax, %rbx +; SSE41-NEXT: addq %r10, %r11 +; SSE41-NEXT: adcq %r9, %rbx ; SSE41-NEXT: movq %r8, %rax ; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rdx, %r9 -; SSE41-NEXT: movq %rax, %r11 -; SSE41-NEXT: movq %r13, %rax +; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rsi ; SSE41-NEXT: movq %rdx, %rsi -; SSE41-NEXT: movq %rax, %rbx -; SSE41-NEXT: addq %r9, %rbx +; SSE41-NEXT: movq %rax, %r14 +; SSE41-NEXT: addq %r9, %r14 ; SSE41-NEXT: adcq $0, %rsi ; SSE41-NEXT: movq %r8, %rax ; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: movq %rdx, %r8 ; SSE41-NEXT: movq %rax, %r9 -; SSE41-NEXT: addq %rbx, %r9 +; SSE41-NEXT: addq %r14, %r9 ; SSE41-NEXT: adcq %rsi, %r8 ; SSE41-NEXT: setb %al ; SSE41-NEXT: movzbl %al, %esi -; SSE41-NEXT: movq %r13, %rax +; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %rbp ; SSE41-NEXT: addq %r8, %rax ; SSE41-NEXT: adcq %rsi, %rdx -; SSE41-NEXT: addq %r10, %rax -; SSE41-NEXT: adcq %r14, %rdx -; SSE41-NEXT: movq %r9, 24(%r15) +; SSE41-NEXT: addq %r11, %rax +; SSE41-NEXT: adcq %rbx, %rdx +; SSE41-NEXT: movq %r9, 24(%r12) ; SSE41-NEXT: sarq $63, %r9 ; SSE41-NEXT: xorq %r9, %rdx ; SSE41-NEXT: xorq %rax, %r9 @@ -3675,11 +3684,11 @@ ; SSE41-NEXT: orq %rdx, %r9 ; SSE41-NEXT: setne %al ; SSE41-NEXT: negl %eax -; SSE41-NEXT: negl %ecx -; SSE41-NEXT: movd %ecx, %xmm0 +; SSE41-NEXT: negl %r15d +; SSE41-NEXT: movd %r15d, %xmm0 ; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: movq %r11, 16(%r15) -; SSE41-NEXT: movq %rdi, (%r15) +; SSE41-NEXT: movq %r10, 16(%r12) +; SSE41-NEXT: movq %rdi, (%r12) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -3696,108 +3705,111 @@ ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %r8, %r14 -; AVX-NEXT: movq %rcx, %r13 +; AVX-NEXT: movq %r8, %r15 ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: movq %rsi, %r11 ; AVX-NEXT: movq %rdi, %r10 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; AVX-NEXT: movq %r11, %rcx -; AVX-NEXT: sarq $63, %rcx -; AVX-NEXT: movq %r9, %r15 -; AVX-NEXT: imulq %rcx, %r15 -; AVX-NEXT: movq %r14, %rax -; AVX-NEXT: mulq %rcx -; AVX-NEXT: movq %rax, %rdi -; AVX-NEXT: addq %rax, %r15 -; AVX-NEXT: addq %rdx, %r15 +; AVX-NEXT: 
movq %r11, %rdi +; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: movq %r9, %rbx +; AVX-NEXT: imulq %rdi, %rbx +; AVX-NEXT: movq %r15, %rax +; AVX-NEXT: mulq %rdi +; AVX-NEXT: movq %rdx, %rdi +; AVX-NEXT: movq %rax, %r12 +; AVX-NEXT: addq %rax, %rdi +; AVX-NEXT: addq %rbx, %rdi ; AVX-NEXT: movq %r9, %rax ; AVX-NEXT: sarq $63, %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: imulq %r11, %rcx +; AVX-NEXT: movq %rax, %r13 +; AVX-NEXT: imulq %r11, %r13 ; AVX-NEXT: mulq %r10 -; AVX-NEXT: movq %rax, %rbx -; AVX-NEXT: addq %rax, %rcx -; AVX-NEXT: addq %rdx, %rcx -; AVX-NEXT: addq %rdi, %rbx -; AVX-NEXT: adcq %r15, %rcx +; AVX-NEXT: movq %rax, %r14 +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: addq %r13, %rbx +; AVX-NEXT: addq %rax, %rbx +; AVX-NEXT: addq %r12, %r14 +; AVX-NEXT: adcq %rdi, %rbx ; AVX-NEXT: movq %r10, %rax -; AVX-NEXT: mulq %r14 -; AVX-NEXT: movq %rdx, %r15 +; AVX-NEXT: mulq %r15 +; AVX-NEXT: movq %rdx, %r12 ; AVX-NEXT: movq %rax, %rdi ; AVX-NEXT: movq %r11, %rax -; AVX-NEXT: mulq %r14 -; AVX-NEXT: movq %rdx, %r14 -; AVX-NEXT: movq %rax, %r12 -; AVX-NEXT: addq %r15, %r12 -; AVX-NEXT: adcq $0, %r14 +; AVX-NEXT: mulq %r15 +; AVX-NEXT: movq %rdx, %r15 +; AVX-NEXT: movq %rax, %r13 +; AVX-NEXT: addq %r12, %r13 +; AVX-NEXT: adcq $0, %r15 ; AVX-NEXT: movq %r10, %rax ; AVX-NEXT: mulq %r9 -; AVX-NEXT: movq %rdx, %r15 +; AVX-NEXT: movq %rdx, %r12 ; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: addq %r12, %r10 -; AVX-NEXT: adcq %r14, %r15 +; AVX-NEXT: addq %r13, %r10 +; AVX-NEXT: adcq %r15, %r12 ; AVX-NEXT: setb %al -; AVX-NEXT: movzbl %al, %r14d +; AVX-NEXT: movzbl %al, %r15d ; AVX-NEXT: movq %r11, %rax ; AVX-NEXT: mulq %r9 -; AVX-NEXT: addq %r15, %rax -; AVX-NEXT: adcq %r14, %rdx -; AVX-NEXT: addq %rbx, %rax -; AVX-NEXT: adcq %rcx, %rdx -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX-NEXT: movq %r10, 8(%r15) +; AVX-NEXT: addq %r12, %rax +; AVX-NEXT: adcq %r15, %rdx +; AVX-NEXT: addq %r14, %rax +; AVX-NEXT: adcq %rbx, %rdx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX-NEXT: movq %r10, 8(%r12) ; AVX-NEXT: sarq $63, %r10 ; AVX-NEXT: xorq %r10, %rdx ; AVX-NEXT: xorq %rax, %r10 -; AVX-NEXT: xorl %ecx, %ecx +; AVX-NEXT: xorl %r15d, %r15d ; AVX-NEXT: orq %rdx, %r10 -; AVX-NEXT: setne %cl -; AVX-NEXT: movq %r13, %r9 +; AVX-NEXT: setne %r15b +; AVX-NEXT: movq %rcx, %r9 ; AVX-NEXT: sarq $63, %r9 ; AVX-NEXT: movq %rbp, %r11 ; AVX-NEXT: imulq %r9, %r11 ; AVX-NEXT: movq %rsi, %rax ; AVX-NEXT: mulq %r9 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: addq %rax, %r11 -; AVX-NEXT: addq %rdx, %r11 +; AVX-NEXT: movq %rdx, %r9 +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: addq %rax, %r9 +; AVX-NEXT: addq %r11, %r9 ; AVX-NEXT: movq %rbp, %rax ; AVX-NEXT: sarq $63, %rax ; AVX-NEXT: movq %rax, %r14 -; AVX-NEXT: imulq %r13, %r14 +; AVX-NEXT: imulq %rcx, %r14 ; AVX-NEXT: mulq %r8 -; AVX-NEXT: movq %rax, %r10 -; AVX-NEXT: addq %rax, %r14 -; AVX-NEXT: addq %rdx, %r14 -; AVX-NEXT: addq %r9, %r10 -; AVX-NEXT: adcq %r11, %r14 +; AVX-NEXT: movq %rax, %r11 +; AVX-NEXT: movq %rdx, %rbx +; AVX-NEXT: addq %r14, %rbx +; AVX-NEXT: addq %rax, %rbx +; AVX-NEXT: addq %r10, %r11 +; AVX-NEXT: adcq %r9, %rbx ; AVX-NEXT: movq %r8, %rax ; AVX-NEXT: mulq %rsi ; AVX-NEXT: movq %rdx, %r9 -; AVX-NEXT: movq %rax, %r11 -; AVX-NEXT: movq %r13, %rax +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rsi ; AVX-NEXT: movq %rdx, %rsi -; AVX-NEXT: movq %rax, %rbx -; AVX-NEXT: addq %r9, %rbx +; AVX-NEXT: movq %rax, %r14 +; AVX-NEXT: addq %r9, %r14 ; AVX-NEXT: adcq $0, %rsi ; AVX-NEXT: movq %r8, %rax ; AVX-NEXT: mulq %rbp ; AVX-NEXT: movq %rdx, %r8 ; 
AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: addq %rbx, %r9 +; AVX-NEXT: addq %r14, %r9 ; AVX-NEXT: adcq %rsi, %r8 ; AVX-NEXT: setb %al ; AVX-NEXT: movzbl %al, %esi -; AVX-NEXT: movq %r13, %rax +; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %rbp ; AVX-NEXT: addq %r8, %rax ; AVX-NEXT: adcq %rsi, %rdx -; AVX-NEXT: addq %r10, %rax -; AVX-NEXT: adcq %r14, %rdx -; AVX-NEXT: movq %r9, 24(%r15) +; AVX-NEXT: addq %r11, %rax +; AVX-NEXT: adcq %rbx, %rdx +; AVX-NEXT: movq %r9, 24(%r12) ; AVX-NEXT: sarq $63, %r9 ; AVX-NEXT: xorq %r9, %rdx ; AVX-NEXT: xorq %rax, %r9 @@ -3805,11 +3817,11 @@ ; AVX-NEXT: orq %rdx, %r9 ; AVX-NEXT: setne %al ; AVX-NEXT: negl %eax -; AVX-NEXT: negl %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: negl %r15d +; AVX-NEXT: vmovd %r15d, %xmm0 ; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: movq %r11, 16(%r15) -; AVX-NEXT: movq %rdi, (%r15) +; AVX-NEXT: movq %r10, 16(%r12) +; AVX-NEXT: movq %rdi, (%r12) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -3826,104 +3838,110 @@ ; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx +; AVX512F-NEXT: movq %r9, %rbp ; AVX512F-NEXT: movq %rcx, %r11 ; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; AVX512F-NEXT: movq %rsi, %r9 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512F-NEXT: sarq $63, %rcx -; AVX512F-NEXT: movq %rbp, %r12 -; AVX512F-NEXT: imulq %rcx, %r12 -; AVX512F-NEXT: movq %r14, %rax +; AVX512F-NEXT: movq %rsi, %rbx +; AVX512F-NEXT: imulq %rcx, %rbx +; AVX512F-NEXT: movq %r15, %rax ; AVX512F-NEXT: mulq %rcx -; AVX512F-NEXT: movq %rax, %r15 -; AVX512F-NEXT: addq %rax, %r12 -; AVX512F-NEXT: addq %rdx, %r12 -; AVX512F-NEXT: movq %rbp, %rax +; AVX512F-NEXT: movq %rdx, %rcx +; AVX512F-NEXT: movq %rax, %r12 +; AVX512F-NEXT: addq %rax, %rcx +; AVX512F-NEXT: addq %rbx, %rcx +; AVX512F-NEXT: movq %rsi, %rax ; AVX512F-NEXT: sarq $63, %rax -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: imulq %r11, %rcx +; AVX512F-NEXT: movq %rax, %r13 +; AVX512F-NEXT: imulq %r11, %r13 ; AVX512F-NEXT: mulq %r10 -; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: addq %rax, %rcx -; AVX512F-NEXT: addq %rdx, %rcx -; AVX512F-NEXT: addq %r15, %rbx -; AVX512F-NEXT: adcq %r12, %rcx +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %rdx, %rbx +; AVX512F-NEXT: addq %r13, %rbx +; AVX512F-NEXT: addq %rax, %rbx +; AVX512F-NEXT: addq %r12, %r14 +; AVX512F-NEXT: adcq %rcx, %rbx ; AVX512F-NEXT: movq %r10, %rax -; AVX512F-NEXT: mulq %r14 -; AVX512F-NEXT: movq %rdx, %r15 -; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: mulq %r15 +; AVX512F-NEXT: movq %rdx, %r12 +; AVX512F-NEXT: movq %rax, %rcx ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %r14 -; AVX512F-NEXT: movq %rdx, %r14 -; AVX512F-NEXT: movq %rax, %r12 -; AVX512F-NEXT: addq %r15, %r12 -; AVX512F-NEXT: adcq $0, %r14 -; AVX512F-NEXT: movq %r10, %rax -; AVX512F-NEXT: mulq %rbp +; AVX512F-NEXT: mulq %r15 ; AVX512F-NEXT: movq %rdx, %r15 +; AVX512F-NEXT: movq %rax, %r13 +; AVX512F-NEXT: addq %r12, %r13 +; AVX512F-NEXT: adcq $0, %r15 +; AVX512F-NEXT: movq %r10, %rax +; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: movq %rdx, %r12 ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %r12, %r10 -; AVX512F-NEXT: adcq %r14, %r15 +; AVX512F-NEXT: addq %r13, %r10 +; AVX512F-NEXT: adcq %r15, %r12 ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %r14d +; AVX512F-NEXT: 
movzbl %al, %r15d ; AVX512F-NEXT: movq %r11, %rax -; AVX512F-NEXT: mulq %rbp -; AVX512F-NEXT: addq %r15, %rax -; AVX512F-NEXT: adcq %r14, %rdx -; AVX512F-NEXT: addq %rbx, %rax -; AVX512F-NEXT: adcq %rcx, %rdx -; AVX512F-NEXT: movq %r10, 24(%r13) +; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: addq %r12, %rax +; AVX512F-NEXT: adcq %r15, %rdx +; AVX512F-NEXT: addq %r14, %rax +; AVX512F-NEXT: adcq %rbx, %rdx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512F-NEXT: movq %r10, 24(%r12) ; AVX512F-NEXT: sarq $63, %r10 ; AVX512F-NEXT: xorq %r10, %rdx ; AVX512F-NEXT: xorq %rax, %r10 ; AVX512F-NEXT: orq %rdx, %r10 ; AVX512F-NEXT: setne %al ; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: movq %rsi, %rcx -; AVX512F-NEXT: sarq $63, %rcx -; AVX512F-NEXT: movq %r9, %rbx -; AVX512F-NEXT: imulq %rcx, %rbx +; AVX512F-NEXT: movq %r9, %rsi +; AVX512F-NEXT: sarq $63, %rsi +; AVX512F-NEXT: movq %rbp, %rbx +; AVX512F-NEXT: imulq %rsi, %rbx ; AVX512F-NEXT: movq %r8, %rax -; AVX512F-NEXT: mulq %rcx -; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %rax, %rbx -; AVX512F-NEXT: addq %rdx, %rbx -; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: mulq %rsi +; AVX512F-NEXT: movq %rdx, %r10 +; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: addq %rax, %r10 +; AVX512F-NEXT: addq %rbx, %r10 +; AVX512F-NEXT: movq %rbp, %rax ; AVX512F-NEXT: sarq $63, %rax -; AVX512F-NEXT: movq %rax, %rcx -; AVX512F-NEXT: imulq %rsi, %rcx +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: imulq %r9, %rsi ; AVX512F-NEXT: mulq %rdi -; AVX512F-NEXT: movq %rax, %r11 -; AVX512F-NEXT: addq %rax, %rcx -; AVX512F-NEXT: addq %rdx, %rcx -; AVX512F-NEXT: addq %r10, %r11 -; AVX512F-NEXT: adcq %rbx, %rcx +; AVX512F-NEXT: movq %rax, %rbx +; AVX512F-NEXT: movq %rdx, %r14 +; AVX512F-NEXT: addq %rsi, %r14 +; AVX512F-NEXT: addq %rax, %r14 +; AVX512F-NEXT: addq %r11, %rbx +; AVX512F-NEXT: adcq %r10, %r14 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r10 -; AVX512F-NEXT: movq %rax, %rbx -; AVX512F-NEXT: movq %rsi, %rax +; AVX512F-NEXT: movq %rax, %r11 +; AVX512F-NEXT: movq %r9, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: movq %rdx, %r8 -; AVX512F-NEXT: movq %rax, %r14 -; AVX512F-NEXT: addq %r10, %r14 +; AVX512F-NEXT: movq %rax, %r15 +; AVX512F-NEXT: addq %r10, %r15 ; AVX512F-NEXT: adcq $0, %r8 ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: mulq %r9 +; AVX512F-NEXT: mulq %rbp ; AVX512F-NEXT: movq %rdx, %rdi ; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: addq %r14, %r10 +; AVX512F-NEXT: addq %r15, %r10 ; AVX512F-NEXT: adcq %r8, %rdi ; AVX512F-NEXT: setb %al -; AVX512F-NEXT: movzbl %al, %r8d -; AVX512F-NEXT: movq %rsi, %rax -; AVX512F-NEXT: mulq %r9 +; AVX512F-NEXT: movzbl %al, %esi +; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: mulq %rbp ; AVX512F-NEXT: addq %rdi, %rax -; AVX512F-NEXT: adcq %r8, %rdx -; AVX512F-NEXT: addq %r11, %rax -; AVX512F-NEXT: adcq %rcx, %rdx -; AVX512F-NEXT: movq %r10, 8(%r13) +; AVX512F-NEXT: adcq %rsi, %rdx +; AVX512F-NEXT: addq %rbx, %rax +; AVX512F-NEXT: adcq %r14, %rdx +; AVX512F-NEXT: movq %r10, 8(%r12) ; AVX512F-NEXT: sarq $63, %r10 ; AVX512F-NEXT: xorq %r10, %rdx ; AVX512F-NEXT: xorq %rax, %r10 @@ -3935,9 +3953,8 @@ ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512F-NEXT: movq %rax, 16(%r13) -; AVX512F-NEXT: movq %rbx, (%r13) +; AVX512F-NEXT: movq %rcx, 16(%r12) +; AVX512F-NEXT: movq %r11, (%r12) ; AVX512F-NEXT: popq %rbx ; 
AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -3954,104 +3971,110 @@ ; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx +; AVX512BW-NEXT: movq %r9, %rbp ; AVX512BW-NEXT: movq %rcx, %r11 ; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; AVX512BW-NEXT: movq %rsi, %r9 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX512BW-NEXT: sarq $63, %rcx -; AVX512BW-NEXT: movq %rbp, %r12 -; AVX512BW-NEXT: imulq %rcx, %r12 -; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: movq %rsi, %rbx +; AVX512BW-NEXT: imulq %rcx, %rbx +; AVX512BW-NEXT: movq %r15, %rax ; AVX512BW-NEXT: mulq %rcx -; AVX512BW-NEXT: movq %rax, %r15 -; AVX512BW-NEXT: addq %rax, %r12 -; AVX512BW-NEXT: addq %rdx, %r12 -; AVX512BW-NEXT: movq %rbp, %rax +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: addq %rax, %rcx +; AVX512BW-NEXT: addq %rbx, %rcx +; AVX512BW-NEXT: movq %rsi, %rax ; AVX512BW-NEXT: sarq $63, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: imulq %r11, %rcx +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: imulq %r11, %r13 ; AVX512BW-NEXT: mulq %r10 -; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: addq %rax, %rcx -; AVX512BW-NEXT: addq %rdx, %rcx -; AVX512BW-NEXT: addq %r15, %rbx -; AVX512BW-NEXT: adcq %r12, %rcx +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %rdx, %rbx +; AVX512BW-NEXT: addq %r13, %rbx +; AVX512BW-NEXT: addq %rax, %rbx +; AVX512BW-NEXT: addq %r12, %r14 +; AVX512BW-NEXT: adcq %rcx, %rbx ; AVX512BW-NEXT: movq %r10, %rax -; AVX512BW-NEXT: mulq %r14 -; AVX512BW-NEXT: movq %rdx, %r15 -; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: mulq %r15 +; AVX512BW-NEXT: movq %rdx, %r12 +; AVX512BW-NEXT: movq %rax, %rcx ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %r14 -; AVX512BW-NEXT: movq %rdx, %r14 -; AVX512BW-NEXT: movq %rax, %r12 -; AVX512BW-NEXT: addq %r15, %r12 -; AVX512BW-NEXT: adcq $0, %r14 -; AVX512BW-NEXT: movq %r10, %rax -; AVX512BW-NEXT: mulq %rbp +; AVX512BW-NEXT: mulq %r15 ; AVX512BW-NEXT: movq %rdx, %r15 +; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: addq %r12, %r13 +; AVX512BW-NEXT: adcq $0, %r15 +; AVX512BW-NEXT: movq %r10, %rax +; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: movq %rdx, %r12 ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %r12, %r10 -; AVX512BW-NEXT: adcq %r14, %r15 +; AVX512BW-NEXT: addq %r13, %r10 +; AVX512BW-NEXT: adcq %r15, %r12 ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %r14d +; AVX512BW-NEXT: movzbl %al, %r15d ; AVX512BW-NEXT: movq %r11, %rax -; AVX512BW-NEXT: mulq %rbp -; AVX512BW-NEXT: addq %r15, %rax -; AVX512BW-NEXT: adcq %r14, %rdx -; AVX512BW-NEXT: addq %rbx, %rax -; AVX512BW-NEXT: adcq %rcx, %rdx -; AVX512BW-NEXT: movq %r10, 24(%r13) +; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: addq %r12, %rax +; AVX512BW-NEXT: adcq %r15, %rdx +; AVX512BW-NEXT: addq %r14, %rax +; AVX512BW-NEXT: adcq %rbx, %rdx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; AVX512BW-NEXT: movq %r10, 24(%r12) ; AVX512BW-NEXT: sarq $63, %r10 ; AVX512BW-NEXT: xorq %r10, %rdx ; AVX512BW-NEXT: xorq %rax, %r10 ; AVX512BW-NEXT: orq %rdx, %r10 ; AVX512BW-NEXT: setne %al ; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: movq %rsi, %rcx -; AVX512BW-NEXT: sarq $63, %rcx -; AVX512BW-NEXT: movq %r9, %rbx -; AVX512BW-NEXT: imulq %rcx, %rbx +; AVX512BW-NEXT: movq %r9, %rsi +; 
AVX512BW-NEXT: sarq $63, %rsi +; AVX512BW-NEXT: movq %rbp, %rbx +; AVX512BW-NEXT: imulq %rsi, %rbx ; AVX512BW-NEXT: movq %r8, %rax -; AVX512BW-NEXT: mulq %rcx -; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %rax, %rbx -; AVX512BW-NEXT: addq %rdx, %rbx -; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: mulq %rsi +; AVX512BW-NEXT: movq %rdx, %r10 +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: addq %rax, %r10 +; AVX512BW-NEXT: addq %rbx, %r10 +; AVX512BW-NEXT: movq %rbp, %rax ; AVX512BW-NEXT: sarq $63, %rax -; AVX512BW-NEXT: movq %rax, %rcx -; AVX512BW-NEXT: imulq %rsi, %rcx +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: imulq %r9, %rsi ; AVX512BW-NEXT: mulq %rdi -; AVX512BW-NEXT: movq %rax, %r11 -; AVX512BW-NEXT: addq %rax, %rcx -; AVX512BW-NEXT: addq %rdx, %rcx -; AVX512BW-NEXT: addq %r10, %r11 -; AVX512BW-NEXT: adcq %rbx, %rcx +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: movq %rdx, %r14 +; AVX512BW-NEXT: addq %rsi, %r14 +; AVX512BW-NEXT: addq %rax, %r14 +; AVX512BW-NEXT: addq %r11, %rbx +; AVX512BW-NEXT: adcq %r10, %r14 ; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r10 -; AVX512BW-NEXT: movq %rax, %rbx -; AVX512BW-NEXT: movq %rsi, %rax +; AVX512BW-NEXT: movq %rax, %r11 +; AVX512BW-NEXT: movq %r9, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: movq %rdx, %r8 -; AVX512BW-NEXT: movq %rax, %r14 -; AVX512BW-NEXT: addq %r10, %r14 +; AVX512BW-NEXT: movq %rax, %r15 +; AVX512BW-NEXT: addq %r10, %r15 ; AVX512BW-NEXT: adcq $0, %r8 ; AVX512BW-NEXT: movq %rdi, %rax -; AVX512BW-NEXT: mulq %r9 +; AVX512BW-NEXT: mulq %rbp ; AVX512BW-NEXT: movq %rdx, %rdi ; AVX512BW-NEXT: movq %rax, %r10 -; AVX512BW-NEXT: addq %r14, %r10 +; AVX512BW-NEXT: addq %r15, %r10 ; AVX512BW-NEXT: adcq %r8, %rdi ; AVX512BW-NEXT: setb %al -; AVX512BW-NEXT: movzbl %al, %r8d -; AVX512BW-NEXT: movq %rsi, %rax -; AVX512BW-NEXT: mulq %r9 +; AVX512BW-NEXT: movzbl %al, %esi +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: mulq %rbp ; AVX512BW-NEXT: addq %rdi, %rax -; AVX512BW-NEXT: adcq %r8, %rdx -; AVX512BW-NEXT: addq %r11, %rax -; AVX512BW-NEXT: adcq %rcx, %rdx -; AVX512BW-NEXT: movq %r10, 8(%r13) +; AVX512BW-NEXT: adcq %rsi, %rdx +; AVX512BW-NEXT: addq %rbx, %rax +; AVX512BW-NEXT: adcq %r14, %rdx +; AVX512BW-NEXT: movq %r10, 8(%r12) ; AVX512BW-NEXT: sarq $63, %r10 ; AVX512BW-NEXT: xorq %r10, %rdx ; AVX512BW-NEXT: xorq %rax, %r10 @@ -4063,9 +4086,8 @@ ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512BW-NEXT: movq %rax, 16(%r13) -; AVX512BW-NEXT: movq %rbx, (%r13) +; AVX512BW-NEXT: movq %rcx, 16(%r12) +; AVX512BW-NEXT: movq %r11, (%r12) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2932,6 +2932,7 @@ ; SSE2-NEXT: mulq %rdi ; SSE2-NEXT: seto %r12b ; SSE2-NEXT: orb %r15b, %r12b +; SSE2-NEXT: orb %bpl, %r12b ; SSE2-NEXT: leaq (%rsi,%rax), %r10 ; SSE2-NEXT: movq %rdi, %rax ; SSE2-NEXT: mulq %r8 @@ -2940,7 +2941,6 @@ ; SSE2-NEXT: addq %r10, %rsi ; SSE2-NEXT: setb %r10b ; SSE2-NEXT: orb %r12b, %r10b -; SSE2-NEXT: orb %bpl, %r10b ; SSE2-NEXT: testq %r9, %r9 ; SSE2-NEXT: setne %al ; SSE2-NEXT: testq %r11, %r11 @@ -2954,13 +2954,13 @@ ; SSE2-NEXT: mulq %rcx ; SSE2-NEXT: seto %r9b ; SSE2-NEXT: orb %r11b, %r9b +; 
SSE2-NEXT: orb %bpl, %r9b ; SSE2-NEXT: addq %rax, %r8 ; SSE2-NEXT: movq %rcx, %rax ; SSE2-NEXT: mulq %r14 ; SSE2-NEXT: addq %r8, %rdx ; SSE2-NEXT: setb %cl ; SSE2-NEXT: orb %r9b, %cl -; SSE2-NEXT: orb %bpl, %cl ; SSE2-NEXT: movzbl %cl, %ecx ; SSE2-NEXT: negl %ecx ; SSE2-NEXT: movd %ecx, %xmm1 @@ -3005,6 +3005,7 @@ ; SSSE3-NEXT: mulq %rdi ; SSSE3-NEXT: seto %r12b ; SSSE3-NEXT: orb %r15b, %r12b +; SSSE3-NEXT: orb %bpl, %r12b ; SSSE3-NEXT: leaq (%rsi,%rax), %r10 ; SSSE3-NEXT: movq %rdi, %rax ; SSSE3-NEXT: mulq %r8 @@ -3013,7 +3014,6 @@ ; SSSE3-NEXT: addq %r10, %rsi ; SSSE3-NEXT: setb %r10b ; SSSE3-NEXT: orb %r12b, %r10b -; SSSE3-NEXT: orb %bpl, %r10b ; SSSE3-NEXT: testq %r9, %r9 ; SSSE3-NEXT: setne %al ; SSSE3-NEXT: testq %r11, %r11 @@ -3027,13 +3027,13 @@ ; SSSE3-NEXT: mulq %rcx ; SSSE3-NEXT: seto %r9b ; SSSE3-NEXT: orb %r11b, %r9b +; SSSE3-NEXT: orb %bpl, %r9b ; SSSE3-NEXT: addq %rax, %r8 ; SSSE3-NEXT: movq %rcx, %rax ; SSSE3-NEXT: mulq %r14 ; SSSE3-NEXT: addq %r8, %rdx ; SSSE3-NEXT: setb %cl ; SSSE3-NEXT: orb %r9b, %cl -; SSSE3-NEXT: orb %bpl, %cl ; SSSE3-NEXT: movzbl %cl, %ecx ; SSSE3-NEXT: negl %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 @@ -3078,6 +3078,7 @@ ; SSE41-NEXT: mulq %rdi ; SSE41-NEXT: seto %r12b ; SSE41-NEXT: orb %r15b, %r12b +; SSE41-NEXT: orb %bpl, %r12b ; SSE41-NEXT: leaq (%rsi,%rax), %r10 ; SSE41-NEXT: movq %rdi, %rax ; SSE41-NEXT: mulq %r8 @@ -3086,7 +3087,6 @@ ; SSE41-NEXT: addq %r10, %rsi ; SSE41-NEXT: setb %r10b ; SSE41-NEXT: orb %r12b, %r10b -; SSE41-NEXT: orb %bpl, %r10b ; SSE41-NEXT: testq %r9, %r9 ; SSE41-NEXT: setne %al ; SSE41-NEXT: testq %r11, %r11 @@ -3100,13 +3100,13 @@ ; SSE41-NEXT: mulq %rcx ; SSE41-NEXT: seto %r9b ; SSE41-NEXT: orb %r11b, %r9b +; SSE41-NEXT: orb %bpl, %r9b ; SSE41-NEXT: addq %rax, %r8 ; SSE41-NEXT: movq %rcx, %rax ; SSE41-NEXT: mulq %r14 ; SSE41-NEXT: addq %r8, %rdx ; SSE41-NEXT: setb %cl ; SSE41-NEXT: orb %r9b, %cl -; SSE41-NEXT: orb %bpl, %cl ; SSE41-NEXT: movzbl %cl, %ecx ; SSE41-NEXT: negl %ecx ; SSE41-NEXT: movzbl %r10b, %r8d @@ -3150,6 +3150,7 @@ ; AVX-NEXT: mulq %rdi ; AVX-NEXT: seto %r12b ; AVX-NEXT: orb %r15b, %r12b +; AVX-NEXT: orb %bpl, %r12b ; AVX-NEXT: leaq (%rsi,%rax), %r10 ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: mulq %r8 @@ -3158,7 +3159,6 @@ ; AVX-NEXT: addq %r10, %rsi ; AVX-NEXT: setb %r10b ; AVX-NEXT: orb %r12b, %r10b -; AVX-NEXT: orb %bpl, %r10b ; AVX-NEXT: testq %r9, %r9 ; AVX-NEXT: setne %al ; AVX-NEXT: testq %r11, %r11 @@ -3172,13 +3172,13 @@ ; AVX-NEXT: mulq %rcx ; AVX-NEXT: seto %r9b ; AVX-NEXT: orb %r11b, %r9b +; AVX-NEXT: orb %bpl, %r9b ; AVX-NEXT: addq %rax, %r8 ; AVX-NEXT: movq %rcx, %rax ; AVX-NEXT: mulq %r14 ; AVX-NEXT: addq %r8, %rdx ; AVX-NEXT: setb %cl ; AVX-NEXT: orb %r9b, %cl -; AVX-NEXT: orb %bpl, %cl ; AVX-NEXT: movzbl %cl, %ecx ; AVX-NEXT: negl %ecx ; AVX-NEXT: movzbl %r10b, %r8d @@ -3221,6 +3221,7 @@ ; AVX512F-NEXT: mulq %rcx ; AVX512F-NEXT: seto %r12b ; AVX512F-NEXT: orb %r15b, %r12b +; AVX512F-NEXT: orb %bpl, %r12b ; AVX512F-NEXT: addq %rax, %r11 ; AVX512F-NEXT: movq %rcx, %rax ; AVX512F-NEXT: mulq %r14 @@ -3229,7 +3230,6 @@ ; AVX512F-NEXT: addq %r11, %rcx ; AVX512F-NEXT: setb %al ; AVX512F-NEXT: orb %r12b, %al -; AVX512F-NEXT: orb %bpl, %al ; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: testq %r9, %r9 ; AVX512F-NEXT: setne %al @@ -3244,13 +3244,13 @@ ; AVX512F-NEXT: mulq %rdi ; AVX512F-NEXT: seto %r9b ; AVX512F-NEXT: orb %bpl, %r9b +; AVX512F-NEXT: orb %r11b, %r9b ; AVX512F-NEXT: addq %rax, %r10 ; AVX512F-NEXT: movq %rdi, %rax ; AVX512F-NEXT: mulq %r8 ; AVX512F-NEXT: addq %r10, %rdx ; AVX512F-NEXT: setb 
%dil ; AVX512F-NEXT: orb %r9b, %dil -; AVX512F-NEXT: orb %r11b, %dil ; AVX512F-NEXT: andl $1, %edi ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: kshiftlw $1, %k0, %k0 @@ -3293,6 +3293,7 @@ ; AVX512BW-NEXT: mulq %rcx ; AVX512BW-NEXT: seto %r12b ; AVX512BW-NEXT: orb %r15b, %r12b +; AVX512BW-NEXT: orb %bpl, %r12b ; AVX512BW-NEXT: addq %rax, %r11 ; AVX512BW-NEXT: movq %rcx, %rax ; AVX512BW-NEXT: mulq %r14 @@ -3301,7 +3302,6 @@ ; AVX512BW-NEXT: addq %r11, %rcx ; AVX512BW-NEXT: setb %al ; AVX512BW-NEXT: orb %r12b, %al -; AVX512BW-NEXT: orb %bpl, %al ; AVX512BW-NEXT: kmovd %eax, %k0 ; AVX512BW-NEXT: testq %r9, %r9 ; AVX512BW-NEXT: setne %al @@ -3316,13 +3316,13 @@ ; AVX512BW-NEXT: mulq %rdi ; AVX512BW-NEXT: seto %r9b ; AVX512BW-NEXT: orb %bpl, %r9b +; AVX512BW-NEXT: orb %r11b, %r9b ; AVX512BW-NEXT: addq %rax, %r10 ; AVX512BW-NEXT: movq %rdi, %rax ; AVX512BW-NEXT: mulq %r8 ; AVX512BW-NEXT: addq %r10, %rdx ; AVX512BW-NEXT: setb %dil ; AVX512BW-NEXT: orb %r9b, %dil -; AVX512BW-NEXT: orb %r11b, %dil ; AVX512BW-NEXT: andl $1, %edi ; AVX512BW-NEXT: kmovw %edi, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -457,25 +457,25 @@ ; SSE2-NEXT: pandn %xmm3, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $23, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm5, %xmm4 -; SSE2-NEXT: cvttps2dq %xmm4, %xmm4 -; SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $23, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm4, %xmm2 +; SSE2-NEXT: packssdw %xmm1, %xmm2 ; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; @@ -687,25 +687,25 @@ ; X86-SSE2-NEXT: pandn %xmm3, %xmm1 ; X86-SSE2-NEXT: psrlw $1, %xmm3 ; X86-SSE2-NEXT: pand %xmm4, %xmm3 +; X86-SSE2-NEXT: por %xmm1, %xmm3 ; X86-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; X86-SSE2-NEXT: pslld $23, %xmm4 -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216] -; X86-SSE2-NEXT: paddd %xmm5, %xmm4 -; X86-SSE2-NEXT: cvttps2dq %xmm4, %xmm4 -; X86-SSE2-NEXT: pslld $16, %xmm4 -; X86-SSE2-NEXT: psrad $16, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; X86-SSE2-NEXT: pslld $23, %xmm1 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] +; X86-SSE2-NEXT: paddd %xmm4, %xmm1 +; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 +; 
X86-SSE2-NEXT: pslld $16, %xmm1 +; X86-SSE2-NEXT: psrad $16, %xmm1 ; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; X86-SSE2-NEXT: pslld $23, %xmm2 -; X86-SSE2-NEXT: paddd %xmm5, %xmm2 +; X86-SSE2-NEXT: paddd %xmm4, %xmm2 ; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 -; X86-SSE2-NEXT: packssdw %xmm4, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm1, %xmm2 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) @@ -719,33 +719,33 @@ ; SSE2-NEXT: movdqa %xmm2, %xmm6 ; SSE2-NEXT: pand %xmm5, %xmm6 ; SSE2-NEXT: psllw $5, %xmm6 -; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtb %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: pandn %xmm1, %xmm7 ; SSE2-NEXT: psrlw $4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm7, %xmm3 ; SSE2-NEXT: paddb %xmm6, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pandn %xmm4, %xmm7 -; SSE2-NEXT: psrlw $2, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm7, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm7 +; SSE2-NEXT: psrlw $2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm7, %xmm3 ; SSE2-NEXT: paddb %xmm6, %xmm6 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pandn %xmm4, %xmm6 -; SSE2-NEXT: psrlw $1, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE2-NEXT: por %xmm6, %xmm4 +; SSE2-NEXT: pandn %xmm3, %xmm6 +; SSE2-NEXT: psrlw $1, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: por %xmm6, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm2 ; SSE2-NEXT: psllw $5, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 @@ -767,13 +767,13 @@ ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 ; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: var_funnnel_v16i8: @@ -964,33 +964,33 @@ ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; X86-SSE2-NEXT: psllw $5, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm3 ; X86-SSE2-NEXT: pxor %xmm4, %xmm4 -; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm7 +; X86-SSE2-NEXT: pxor %xmm3, %xmm3 +; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm7 ; X86-SSE2-NEXT: pandn %xmm1, %xmm7 ; X86-SSE2-NEXT: psrlw $4, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 -; X86-SSE2-NEXT: por %xmm7, %xmm4 +; 
X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: por %xmm7, %xmm3 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 -; X86-SSE2-NEXT: pandn %xmm4, %xmm7 -; X86-SSE2-NEXT: psrlw $2, %xmm4 -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 -; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: pandn %xmm3, %xmm7 +; X86-SSE2-NEXT: psrlw $2, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: por %xmm7, %xmm3 ; X86-SSE2-NEXT: paddb %xmm6, %xmm6 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 ; X86-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pandn %xmm4, %xmm6 -; X86-SSE2-NEXT: psrlw $1, %xmm4 -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 -; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: pandn %xmm3, %xmm6 +; X86-SSE2-NEXT: psrlw $1, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 +; X86-SSE2-NEXT: por %xmm6, %xmm3 ; X86-SSE2-NEXT: pandn %xmm5, %xmm2 ; X86-SSE2-NEXT: psllw $5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm1, %xmm1 @@ -1012,13 +1012,13 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: paddb %xmm2, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pandn %xmm0, %xmm1 -; X86-SSE2-NEXT: por %xmm4, %xmm1 ; X86-SSE2-NEXT: paddb %xmm0, %xmm0 -; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: por %xmm3, %xmm0 ; X86-SSE2-NEXT: retl %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) ret <16 x i8> %res @@ -1761,10 +1761,10 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: paddw %xmm0, %xmm0 ; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: constant_funnnel_v8i16: @@ -1853,10 +1853,10 @@ ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] ; X86-SSE2-NEXT: pandn %xmm1, %xmm2 ; X86-SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE2-NEXT: por %xmm1, %xmm2 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 ; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: retl %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> ) ret <8 x i16> %res diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -660,16 +660,16 @@ ; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1] ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535] ; SSE-NEXT: pandn %xmm13, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm12 +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 
+; SSE-NEXT: por %xmm11, %xmm12 ; SSE-NEXT: psrld $16, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; SSE-NEXT: movdqa %xmm4, %xmm11 +; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm12, %xmm13 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: por %xmm9, %xmm13 +; SSE-NEXT: pandn %xmm11, %xmm13 +; SSE-NEXT: por %xmm12, %xmm13 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: pand %xmm9, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[3,3,3,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -724,51 +724,51 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride5_vf16: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm3 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm0 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[6],zero,xmm3[u,u,u,7],zero,xmm3[u,u,u,8],zero,xmm3[u,u,u,9] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero,xmm2[9,u] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9],zero,xmm1[u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255> ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = 
xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm9[0,1],zero,zero,zero,xmm9[2,3],zero,zero,zero,xmm9[4,5],zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1],zero,zero,zero,xmm11[2,3],zero,zero,zero,xmm11[4,5],zero,zero,zero,xmm11[6] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero -; AVX1-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10 -; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6,7],zero,zero,zero,xmm9[8,9],zero,zero,zero,xmm9[10,11],zero,zero,zero -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6],zero,zero,zero,xmm1[9,8],zero,zero,zero,xmm1[11,10],zero,zero,zero,xmm1[13,12] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm9, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2],zero,zero,zero,xmm3[5,4],zero,zero,zero,xmm3[7,6],zero,zero,zero,xmm3[9,8] -; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero -; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, 48(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm1, 16(%r9) -; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%r9) +; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6] +; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero +; AVX1-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw 
{{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12] +; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8] +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero +; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, 48(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm2, 16(%r9) +; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%r9) ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm0 -; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%r9) ; AVX1-ONLY-NEXT: retq @@ -809,8 +809,8 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, (%r9) @@ -851,8 +851,8 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero +; AVX2-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r9) @@ -896,8 +896,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, 64(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, (%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -428,14 +428,14 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero ; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) @@ -508,14 +508,14 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[0,8],zero,zero,zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm5[0,8],zero,zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,ymm5[18,26],zero,zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,12],zero,zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,30],zero,zero,zero,zero,zero,zero,ymm1[23,31] ; 
AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm3[4,12],zero,zero,zero,zero,zero,zero,ymm3[5,13],zero,zero,zero,zero,ymm3[22,30],zero,zero,zero,zero,zero,zero,ymm3[23,31],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,12],zero,zero,zero,zero,zero,zero,ymm0[5,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[22,30],zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm5[4,12],zero,zero,zero,zero,zero,zero,ymm5[5,13],zero,zero,zero,zero,ymm5[22,30],zero,zero,zero,zero,zero,zero,ymm5[23,31],zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll --- a/llvm/test/CodeGen/X86/vector-pcmp.ll +++ b/llvm/test/CodeGen/X86/vector-pcmp.ll @@ -322,12 +322,12 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1] -; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,0,3,2] +; SSE2-NEXT: pand %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -114,10 +114,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: psrlq $60, %xmm2 ; SSE2-NEXT: psrlq $60, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: psrlq $60, %xmm3 ; SSE2-NEXT: psrlq $60, %xmm1 ; SSE2-NEXT: paddq %xmm3, %xmm1 -; SSE2-NEXT: paddq %xmm2, %xmm1 ; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddq %xmm1, %xmm0 @@ -128,10 +128,10 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: psrlq $60, %xmm2 ; SSE41-NEXT: psrlq $60, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: psrlq $60, %xmm3 ; SSE41-NEXT: psrlq $60, %xmm1 ; SSE41-NEXT: paddq %xmm3, %xmm1 -; SSE41-NEXT: paddq %xmm2, %xmm1 ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: paddq %xmm1, %xmm0 @@ -142,13 +142,13 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm2 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpsrlq $60, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -188,19 +188,19 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [1,1] ; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: paddq %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pand %xmm8, %xmm3 ; SSE2-NEXT: paddq %xmm7, %xmm3 -; SSE2-NEXT: paddq %xmm5, %xmm3 ; SSE2-NEXT: paddq %xmm1, %xmm3 ; SSE2-NEXT: pand 
%xmm8, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: paddq %xmm6, %xmm2 -; SSE2-NEXT: paddq %xmm4, %xmm2 -; SSE2-NEXT: paddq %xmm3, %xmm2 ; SSE2-NEXT: paddq %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm3, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: movq %xmm0, %rax @@ -211,19 +211,19 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [1,1] ; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: paddq %xmm5, %xmm1 ; SSE41-NEXT: pand %xmm8, %xmm7 ; SSE41-NEXT: pand %xmm8, %xmm3 ; SSE41-NEXT: paddq %xmm7, %xmm3 -; SSE41-NEXT: paddq %xmm5, %xmm3 ; SSE41-NEXT: paddq %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm8, %xmm4 ; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: paddq %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm6 ; SSE41-NEXT: pand %xmm8, %xmm2 ; SSE41-NEXT: paddq %xmm6, %xmm2 -; SSE41-NEXT: paddq %xmm4, %xmm2 -; SSE41-NEXT: paddq %xmm3, %xmm2 ; SSE41-NEXT: paddq %xmm0, %xmm2 +; SSE41-NEXT: paddq %xmm3, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: movq %xmm0, %rax @@ -237,16 +237,16 @@ ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -258,10 +258,10 @@ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -508,10 +508,10 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE2-NEXT: pand %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddd %xmm1, %xmm0 @@ -525,10 +525,10 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm4, %xmm2 ; SSE41-NEXT: pand %xmm4, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: pand %xmm4, %xmm3 ; SSE41-NEXT: pand %xmm4, %xmm1 ; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: paddd %xmm2, %xmm1 ; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: paddd %xmm1, 
%xmm0 @@ -545,8 +545,8 @@ ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -563,8 +563,8 @@ ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 @@ -609,19 +609,19 @@ ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE2-NEXT: pand %xmm8, %xmm5 ; SSE2-NEXT: pand %xmm8, %xmm1 +; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: pand %xmm8, %xmm3 ; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm3 ; SSE2-NEXT: paddd %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm8, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: pand %xmm8, %xmm6 ; SSE2-NEXT: pand %xmm8, %xmm2 ; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -634,19 +634,19 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] ; SSE41-NEXT: pand %xmm8, %xmm5 ; SSE41-NEXT: pand %xmm8, %xmm1 +; SSE41-NEXT: paddd %xmm5, %xmm1 ; SSE41-NEXT: pand %xmm8, %xmm7 ; SSE41-NEXT: pand %xmm8, %xmm3 ; SSE41-NEXT: paddd %xmm7, %xmm3 -; SSE41-NEXT: paddd %xmm5, %xmm3 ; SSE41-NEXT: paddd %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm8, %xmm4 ; SSE41-NEXT: pand %xmm8, %xmm0 +; SSE41-NEXT: paddd %xmm4, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm6 ; SSE41-NEXT: pand %xmm8, %xmm2 ; SSE41-NEXT: paddd %xmm6, %xmm2 -; SSE41-NEXT: paddd %xmm4, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 ; SSE41-NEXT: paddd %xmm0, %xmm2 +; SSE41-NEXT: paddd %xmm3, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] ; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -662,16 +662,16 @@ ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5 +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; 
AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -688,16 +688,16 @@ ; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5 +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 @@ -710,10 +710,10 @@ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2 ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -1144,9 +1144,9 @@ ; SSE2-NEXT: psadbw %xmm1, %xmm2 ; SSE2-NEXT: paddq %xmm6, %xmm2 ; SSE2-NEXT: psadbw %xmm1, %xmm4 -; SSE2-NEXT: paddq %xmm2, %xmm4 ; SSE2-NEXT: psadbw %xmm1, %xmm0 ; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddq %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -1173,9 +1173,9 @@ ; SSE41-NEXT: psadbw %xmm1, %xmm2 ; SSE41-NEXT: paddq %xmm6, %xmm2 ; SSE41-NEXT: psadbw %xmm1, %xmm4 -; SSE41-NEXT: paddq %xmm2, %xmm4 ; SSE41-NEXT: psadbw %xmm1, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: movd %xmm1, %eax @@ -1200,10 +1200,10 @@ ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-sext.ll @@ -122,10 +122,10 @@ ; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE2-NEXT: paddq %xmm4, %xmm5 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = 
xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: paddq %xmm1, %xmm0 -; SSE2-NEXT: paddq %xmm4, %xmm0 ; SSE2-NEXT: paddq %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddq %xmm0, %xmm1 @@ -137,14 +137,14 @@ ; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovsxbq %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlq $48, %xmm3 -; SSE41-NEXT: pmovsxbq %xmm3, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrlq $48, %xmm1 +; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 ; SSE41-NEXT: psrld $16, %xmm0 ; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 -; SSE41-NEXT: paddq %xmm3, %xmm0 -; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: paddq %xmm1, %xmm0 +; SSE41-NEXT: paddq %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: paddq %xmm0, %xmm1 ; SSE41-NEXT: movq %xmm1, %rax @@ -160,7 +160,7 @@ ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -202,12 +202,12 @@ ; SSE2-LABEL: test_v16i64_v16i8: ; SSE2: # %bb.0: ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: psrad $24, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] @@ -216,28 +216,28 @@ ; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 ; SSE2-NEXT: movdqa %xmm0, %xmm8 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; SSE2-NEXT: paddq %xmm5, %xmm8 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $24, %xmm4 -; SSE2-NEXT: pxor %xmm9, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm9 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: movdqa %xmm4, %xmm9 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $24, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm6, %xmm11 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3] -; SSE2-NEXT: paddq %xmm10, %xmm11 -; SSE2-NEXT: paddq %xmm5, %xmm11 -; SSE2-NEXT: paddq %xmm8, %xmm11 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] +; 
SSE2-NEXT: paddq %xmm9, %xmm10 +; SSE2-NEXT: paddq %xmm8, %xmm10 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; SSE2-NEXT: paddq %xmm4, %xmm6 -; SSE2-NEXT: paddq %xmm1, %xmm6 -; SSE2-NEXT: paddq %xmm11, %xmm6 ; SSE2-NEXT: paddq %xmm0, %xmm6 +; SSE2-NEXT: paddq %xmm10, %xmm6 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] ; SSE2-NEXT: paddq %xmm6, %xmm0 ; SSE2-NEXT: movq %xmm0, %rax @@ -247,31 +247,31 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovsxbq %xmm2, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlq $48, %xmm3 -; SSE41-NEXT: pmovsxbq %xmm3, %xmm3 -; SSE41-NEXT: paddq %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] +; SSE41-NEXT: pmovsxbq %xmm0, %xmm4 +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1] ; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 -; SSE41-NEXT: paddq %xmm0, %xmm3 ; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmovsxbq %xmm1, %xmm0 -; SSE41-NEXT: paddq %xmm0, %xmm3 -; SSE41-NEXT: pmovsxbq %xmm4, %xmm0 -; SSE41-NEXT: pmovsxbq %xmm5, %xmm1 -; SSE41-NEXT: pmovsxbq %xmm6, %xmm4 -; SSE41-NEXT: paddq %xmm1, %xmm4 -; SSE41-NEXT: paddq %xmm0, %xmm4 -; SSE41-NEXT: paddq %xmm3, %xmm4 -; SSE41-NEXT: paddq %xmm2, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 +; SSE41-NEXT: paddq %xmm0, %xmm1 +; SSE41-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pmovsxbq %xmm2, %xmm0 +; SSE41-NEXT: psrlq $48, %xmm3 +; SSE41-NEXT: pmovsxbq %xmm3, %xmm2 +; SSE41-NEXT: paddq %xmm0, %xmm2 +; SSE41-NEXT: paddq %xmm1, %xmm2 +; SSE41-NEXT: pmovsxbq %xmm5, %xmm0 ; SSE41-NEXT: paddq %xmm4, %xmm0 +; SSE41-NEXT: pmovsxbq %xmm6, %xmm1 +; SSE41-NEXT: pmovsxbq %xmm7, %xmm3 +; SSE41-NEXT: paddq %xmm1, %xmm3 +; SSE41-NEXT: paddq %xmm0, %xmm3 +; SSE41-NEXT: paddq %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] +; SSE41-NEXT: paddq %xmm3, %xmm0 ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: retq ; @@ -283,23 +283,23 @@ ; AVX1-NEXT: vpmovsxbw %xmm2, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1] ; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm5 -; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,3,3,3] -; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 -; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3] -; AVX1-NEXT: vpmovsxwq %xmm7, %xmm7 -; AVX1-NEXT: vpaddq %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[2,3,2,3] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm4 +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,3,3,3] +; 
AVX1-NEXT: vpmovsxwq %xmm5, %xmm5 +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,3,3,3] +; AVX1-NEXT: vpmovsxwq %xmm6, %xmm6 +; AVX1-NEXT: vpaddq %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-NEXT: vpmovsxwq %xmm4, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-NEXT: vpmovsxwq %xmm3, %xmm3 ; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 ; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -317,7 +317,7 @@ ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 ; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -562,12 +562,12 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: paddd %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 @@ -581,13 +581,13 @@ ; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovsxbd %xmm3, %xmm3 +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] +; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: paddd %xmm3, %xmm0 -; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -600,12 +600,12 @@ ; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -619,12 +619,12 @@ ; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -676,27 +676,27 @@ ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: paddd %xmm3, %xmm5 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE2-NEXT: psrad $24, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: psrad $24, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: psrad $24, %xmm7 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm5, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: psrad $24, %xmm6 +; SSE2-NEXT: paddd %xmm3, %xmm6 +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: paddd %xmm6, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -710,23 +710,23 @@ ; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovsxbd %xmm3, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] +; SSE41-NEXT: paddd %xmm2, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] ; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; SSE41-NEXT: pmovsxbd %xmm5, %xmm5 -; SSE41-NEXT: paddd %xmm4, %xmm5 -; SSE41-NEXT: paddd %xmm2, %xmm5 -; SSE41-NEXT: paddd %xmm3, %xmm5 +; SSE41-NEXT: paddd %xmm2, %xmm4 +; SSE41-NEXT: paddd %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbd %xmm3, %xmm3 ; SSE41-NEXT: paddd %xmm2, %xmm3 ; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: paddd %xmm5, %xmm1 ; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 ; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: paddd %xmm3, %xmm0 +; SSE41-NEXT: paddd 
%xmm4, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: paddd %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -741,23 +741,23 @@ ; AVX1-SLOW-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] -; AVX1-SLOW-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX1-SLOW-NEXT: vpmovsxbd %xmm5, %xmm5 -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-SLOW-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpmovsxbd %xmm4, %xmm4 ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-SLOW-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -773,23 +773,23 @@ ; AVX1-FAST-NEXT: vpmovsxbd %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] ; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] -; AVX1-FAST-NEXT: vpmovsxbd %xmm4, %xmm4 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3] -; AVX1-FAST-NEXT: vpmovsxbd %xmm5, %xmm5 -; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4 -; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vpmovsxbd %xmm4, %xmm4 +; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-FAST-NEXT: vpmovsxbd %xmm3, %xmm3 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpmovsxbd %xmm4, %xmm4 ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm4, %xmm3 ; AVX1-FAST-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpmovsxbd %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 @@ -806,9 +806,9 @@ ; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ 
-1171,12 +1171,12 @@ ; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: paddw %xmm2, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm0 ; SSE2-NEXT: paddw %xmm3, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: paddw %xmm0, %xmm1 @@ -1197,9 +1197,9 @@ ; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 ; SSE41-NEXT: paddw %xmm2, %xmm3 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm1 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 ; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: paddw %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1220,9 +1220,9 @@ ; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm3 ; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 ; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1243,9 +1243,9 @@ ; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm3 ; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 ; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1304,25 +1304,25 @@ ; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; SSE2-NEXT: paddw %xmm4, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: paddw %xmm6, %xmm7 -; SSE2-NEXT: paddw %xmm4, %xmm7 -; SSE2-NEXT: paddw %xmm5, %xmm7 +; SSE2-NEXT: paddw %xmm4, %xmm6 +; SSE2-NEXT: paddw %xmm5, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: paddw %xmm3, %xmm1 ; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: paddw %xmm7, %xmm1 ; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm6, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: paddw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1338,23 +1338,23 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pmovsxbw %xmm2, %xmm4 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm5 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm6 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm7 -; SSE41-NEXT: paddw %xmm6, %xmm7 -; SSE41-NEXT: paddw %xmm4, %xmm7 +; SSE41-NEXT: paddw %xmm4, %xmm5 +; SSE41-NEXT: pmovsxbw %xmm3, %xmm4 +; SSE41-NEXT: pmovsxbw %xmm1, %xmm6 +; SSE41-NEXT: paddw %xmm4, %xmm6 +; SSE41-NEXT: paddw %xmm5, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] -; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 +; SSE41-NEXT: paddw %xmm2, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] +; SSE41-NEXT: pmovsxbw %xmm2, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE41-NEXT: pmovsxbw %xmm1, %xmm1 -; SSE41-NEXT: paddw %xmm3, %xmm1 ; SSE41-NEXT: paddw %xmm2, %xmm1 ; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: paddw %xmm7, %xmm1 -; SSE41-NEXT: paddw %xmm5, %xmm1 +; SSE41-NEXT: paddw %xmm6, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE41-NEXT: paddw %xmm1, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1370,25 +1370,25 @@ ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm2 ; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm3 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-SLOW-NEXT: vpmovsxbw %xmm4, %xmm5 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-SLOW-NEXT: vpmovsxbw %xmm6, %xmm7 -; AVX1-SLOW-NEXT: vpaddw %xmm5, %xmm7, %xmm5 -; AVX1-SLOW-NEXT: vpaddw %xmm5, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm4 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-SLOW-NEXT: vpmovsxbw %xmm5, %xmm6 +; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm6, %xmm4 +; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-SLOW-NEXT: vpmovsxbw %xmm4, %xmm4 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-SLOW-NEXT: vpmovsxbw %xmm5, %xmm5 -; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm5, %xmm4 -; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-SLOW-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX1-SLOW-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm2, %xmm0 
-; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm3, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1404,25 +1404,25 @@ ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm2 ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm3 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-FAST-NEXT: vpmovsxbw %xmm4, %xmm5 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-FAST-NEXT: vpmovsxbw %xmm6, %xmm7 -; AVX1-FAST-NEXT: vpaddw %xmm5, %xmm7, %xmm5 -; AVX1-FAST-NEXT: vpaddw %xmm5, %xmm2, %xmm2 +; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm4 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-FAST-NEXT: vpmovsxbw %xmm5, %xmm6 +; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm6, %xmm4 +; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; AVX1-FAST-NEXT: vpmovsxbw %xmm4, %xmm4 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] -; AVX1-FAST-NEXT: vpmovsxbw %xmm5, %xmm5 -; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm5, %xmm4 -; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-FAST-NEXT: vpmovsxbw %xmm1, %xmm1 +; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[2,3,2,3] +; AVX1-FAST-NEXT: vpmovsxbw %xmm3, %xmm3 +; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm2, %xmm0 -; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm3, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1437,13 +1437,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm2 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm3 +; AVX2-NEXT: vpaddw %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpaddw %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1811,12 +1811,12 @@ ; SSE2-NEXT: pcmpgtb %xmm2, %xmm5 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE2-NEXT: paddb %xmm5, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpgtb %xmm3, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 ; SSE2-NEXT: paddb %xmm0, %xmm3 -; SSE2-NEXT: paddb %xmm5, %xmm3 ; SSE2-NEXT: paddb %xmm2, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE2-NEXT: paddb %xmm3, %xmm0 @@ -1832,12 +1832,12 @@ ; SSE41-NEXT: pcmpgtb %xmm2, %xmm5 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE41-NEXT: paddb %xmm5, %xmm2 ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pcmpgtb %xmm3, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 ; SSE41-NEXT: paddb %xmm0, %xmm3 -; SSE41-NEXT: paddb %xmm5, %xmm3 ; SSE41-NEXT: paddb %xmm2, %xmm3 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; SSE41-NEXT: paddb %xmm3, %xmm0 @@ -1851,13 +1851,13 @@ ; AVX1-NEXT: vpxor %xmm2, 
%xmm2, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 @@ -1913,27 +1913,27 @@ ; SSE2-NEXT: pcmpgtb %xmm4, %xmm9 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pcmpgtb %xmm0, %xmm4 +; SSE2-NEXT: paddb %xmm9, %xmm4 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpgtb %xmm6, %xmm0 ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpgtb %xmm2, %xmm6 ; SSE2-NEXT: paddb %xmm0, %xmm6 -; SSE2-NEXT: paddb %xmm9, %xmm6 +; SSE2-NEXT: paddb %xmm4, %xmm6 ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: pcmpgtb %xmm5, %xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm7, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm5 -; SSE2-NEXT: paddb %xmm1, %xmm5 -; SSE2-NEXT: paddb %xmm0, %xmm5 -; SSE2-NEXT: paddb %xmm2, %xmm5 -; SSE2-NEXT: paddb %xmm6, %xmm5 -; SSE2-NEXT: paddb %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE2-NEXT: paddb %xmm5, %xmm0 +; SSE2-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: paddb %xmm6, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: psadbw %xmm8, %xmm0 ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax @@ -1946,27 +1946,27 @@ ; SSE41-NEXT: pcmpgtb %xmm4, %xmm9 ; SSE41-NEXT: pxor %xmm4, %xmm4 ; SSE41-NEXT: pcmpgtb %xmm0, %xmm4 +; SSE41-NEXT: paddb %xmm9, %xmm4 ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pcmpgtb %xmm6, %xmm0 ; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: pcmpgtb %xmm2, %xmm6 ; SSE41-NEXT: paddb %xmm0, %xmm6 -; SSE41-NEXT: paddb %xmm9, %xmm6 +; SSE41-NEXT: paddb %xmm4, %xmm6 ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pcmpgtb %xmm5, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE41-NEXT: paddb %xmm0, %xmm2 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pcmpgtb %xmm7, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm7, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pcmpgtb %xmm3, %xmm5 -; SSE41-NEXT: paddb %xmm1, %xmm5 -; SSE41-NEXT: paddb %xmm0, %xmm5 -; SSE41-NEXT: paddb %xmm2, %xmm5 -; SSE41-NEXT: paddb %xmm6, %xmm5 -; SSE41-NEXT: paddb %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] -; SSE41-NEXT: paddb %xmm5, %xmm0 +; SSE41-NEXT: pcmpgtb %xmm3, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm2, %xmm1 +; SSE41-NEXT: paddb %xmm6, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE41-NEXT: paddb %xmm1, %xmm0 ; SSE41-NEXT: psadbw %xmm8, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: # kill: def $al killed $al killed $eax @@ -1977,23 +1977,23 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm7 -; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm8 -; AVX1-NEXT: vpaddb %xmm7, %xmm8, %xmm7 -; AVX1-NEXT: vpaddb 
%xmm7, %xmm5, %xmm5 +; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm7 +; AVX1-NEXT: vpaddb %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpaddb %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpcmpgtb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpcmpgtb %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0 @@ -2007,10 +2007,10 @@ ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: vpcmpgtb %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpcmpgtb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtb %ymm3, %ymm4, %ymm2 ; AVX2-NEXT: vpcmpgtb %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-zext.ll @@ -594,9 +594,9 @@ ; SSE-NEXT: psadbw %xmm4, %xmm1 ; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: psadbw %xmm4, %xmm2 -; SSE-NEXT: paddq %xmm1, %xmm2 ; SSE-NEXT: psadbw %xmm4, %xmm0 ; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddq %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax @@ -612,9 +612,9 @@ ; AVX1-NEXT: vpsadbw %xmm3, %xmm4, %xmm4 ; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-add.ll b/llvm/test/CodeGen/X86/vector-reduce-add.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add.ll @@ -82,11 +82,11 @@ ; SSE-LABEL: test_v8i64: ; SSE: # %bb.0: ; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm2, %xmm1 -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i64: @@ -94,8 +94,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -132,30 +132,30 @@ ; SSE-LABEL: test_v16i64: ; SSE: # %bb.0: ; SSE-NEXT: paddq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm3 -; SSE-NEXT: paddq %xmm1, %xmm3 -; SSE-NEXT: paddq %xmm4, %xmm2 -; SSE-NEXT: paddq %xmm3, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: paddq %xmm4, %xmm0 ; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: paddq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: paddq %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddq %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -165,7 +165,7 @@ ; AVX2-LABEL: test_v16i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -344,13 +344,13 @@ ; SSE-LABEL: test_v16i32: ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: retq ; ; AVX1-SLOW-LABEL: test_v16i32: @@ -358,8 +358,8 @@ ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -373,8 +373,8 @@ ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 @@ -416,14 +416,14 @@ ; SSE-LABEL: test_v32i32: ; SSE: # %bb.0: ; SSE-NEXT: paddd %xmm6, 
%xmm2 -; SSE-NEXT: paddd %xmm7, %xmm3 -; SSE-NEXT: paddd %xmm5, %xmm3 -; SSE-NEXT: paddd %xmm1, %xmm3 -; SSE-NEXT: paddd %xmm4, %xmm2 -; SSE-NEXT: paddd %xmm3, %xmm2 -; SSE-NEXT: paddd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm7, %xmm3 +; SSE-NEXT: paddd %xmm5, %xmm1 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax @@ -432,16 +432,16 @@ ; AVX1-SLOW-LABEL: test_v32i32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm5 +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm4, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -453,16 +453,16 @@ ; AVX1-FAST-LABEL: test_v32i32: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm5 +; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm5, %xmm4 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm4, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 @@ -473,7 +473,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 @@ -746,16 +746,16 @@ ; SSE-LABEL: test_v32i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddw %xmm2, %xmm0 ; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movd 
%xmm0, %eax +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; @@ -764,8 +764,8 @@ ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -782,8 +782,8 @@ ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -834,14 +834,14 @@ ; SSE-LABEL: test_v64i16: ; SSE: # %bb.0: ; SSE-NEXT: paddw %xmm6, %xmm2 -; SSE-NEXT: paddw %xmm7, %xmm3 -; SSE-NEXT: paddw %xmm5, %xmm3 -; SSE-NEXT: paddw %xmm1, %xmm3 -; SSE-NEXT: paddw %xmm4, %xmm2 -; SSE-NEXT: paddw %xmm3, %xmm2 -; SSE-NEXT: paddw %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: paddw %xmm4, %xmm0 ; SSE-NEXT: paddw %xmm2, %xmm0 +; SSE-NEXT: paddw %xmm7, %xmm3 +; SSE-NEXT: paddw %xmm5, %xmm1 +; SSE-NEXT: paddw %xmm3, %xmm1 +; SSE-NEXT: paddw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddw %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -854,16 +854,16 @@ ; AVX1-SLOW-LABEL: test_v64i16: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4 +; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm5 +; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm5, %xmm4 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpaddw %xmm0, %xmm4, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -878,16 +878,16 @@ ; AVX1-FAST-LABEL: test_v64i16: ; AVX1-FAST: # %bb.0: ; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4 +; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm5 +; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm5, %xmm4 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 +; 
AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX1-FAST-NEXT: vpaddw %xmm0, %xmm4, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -901,7 +901,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 @@ -1231,13 +1231,13 @@ ; SSE-LABEL: test_v64i8: ; SSE: # %bb.0: ; SSE-NEXT: paddb %xmm3, %xmm1 -; SSE-NEXT: paddb %xmm2, %xmm1 -; SSE-NEXT: paddb %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: paddb %xmm2, %xmm0 ; SSE-NEXT: paddb %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psadbw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: paddb %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: psadbw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; @@ -1246,8 +1246,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1293,17 +1293,17 @@ ; SSE-LABEL: test_v128i8: ; SSE: # %bb.0: ; SSE-NEXT: paddb %xmm7, %xmm3 -; SSE-NEXT: paddb %xmm5, %xmm3 -; SSE-NEXT: paddb %xmm1, %xmm3 +; SSE-NEXT: paddb %xmm5, %xmm1 +; SSE-NEXT: paddb %xmm3, %xmm1 ; SSE-NEXT: paddb %xmm6, %xmm2 -; SSE-NEXT: paddb %xmm4, %xmm2 -; SSE-NEXT: paddb %xmm3, %xmm2 -; SSE-NEXT: paddb %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: paddb %xmm4, %xmm0 ; SSE-NEXT: paddb %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psadbw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: paddb %xmm0, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: psadbw %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; @@ -1313,13 +1313,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1332,7 +1332,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -747,10 +747,10 @@ ; SSE-LABEL: trunc_v64i8_v64i1: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: cmpw $-1, %ax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -783,8 +783,8 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1284,10 +1284,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: cmpw $-1, %ax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -1295,9 +1295,9 @@ ; SSE41-LABEL: icmp0_v8i64_v8i1: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -1353,10 +1353,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: cmpw $-1, %ax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -1364,9 +1364,9 @@ ; SSE41-LABEL: icmp0_v16i32_v16i1: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -1403,10 +1403,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: cmpw $-1, %ax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -1414,9 +1414,9 @@ ; SSE41-LABEL: icmp0_v32i16_v32i1: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -1482,10 +1482,10 @@ ; SSE2-LABEL: icmp0_v64i8_v64i1: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, 
%xmm1 -; SSE2-NEXT: pcmpeqb %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpw $-1, %ax ; SSE2-NEXT: sete %al @@ -1494,9 +1494,9 @@ ; SSE41-LABEL: icmp0_v64i8_v64i1: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -1525,8 +1525,8 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2199,10 +2199,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: cmpw $-1, %ax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -2211,10 +2211,10 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm4, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm4, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -2280,10 +2280,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: cmpw $-1, %ax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -2292,10 +2292,10 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm4, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm4, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -2340,10 +2340,10 @@ ; SSE2: # %bb.0: ; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm4, %xmm1 -; SSE2-NEXT: pmovmskb %xmm1, %eax +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: cmpw $-1, %ax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -2352,10 +2352,10 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 ; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm4, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: psubb %xmm4, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; 
SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -2430,10 +2430,10 @@ ; SSE2-LABEL: icmp1_v64i8_v64i1: ; SSE2: # %bb.0: ; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm3, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpw $-1, %ax ; SSE2-NEXT: sete %al @@ -2442,11 +2442,11 @@ ; SSE41-LABEL: icmp1_v64i8_v64i1: ; SSE41: # %bb.0: ; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: psubb %xmm3, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pand %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -2482,8 +2482,8 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -3126,9 +3126,9 @@ ; SSE-NEXT: pcmpeqb %xmm5, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pcmpeqb %xmm6, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: cmpw $-1, %ax ; SSE-NEXT: sete %al @@ -3144,9 +3144,9 @@ ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpw $-1, %ax ; AVX1-NEXT: sete %al @@ -3201,9 +3201,9 @@ ; SSE-NEXT: pcmpeqb %xmm5, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pcmpeqb %xmm6, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: cmpw $-1, %ax ; SSE-NEXT: sete %al @@ -3219,9 +3219,9 @@ ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpw $-1, %ax ; AVX1-NEXT: sete %al @@ -3257,9 +3257,9 @@ ; SSE-NEXT: pcmpeqb %xmm5, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pcmpeqb %xmm6, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: cmpw $-1, %ax ; SSE-NEXT: sete %al @@ -3275,9 +3275,9 @@ ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpw $-1, %ax ; AVX1-NEXT: sete %al @@ -3341,10 
+3341,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: pcmpeqb %xmm6, %xmm2 ; SSE-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pcmpeqb %xmm7, %xmm3 ; SSE-NEXT: pcmpeqb %xmm5, %xmm1 ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pmovmskb %xmm1, %eax ; SSE-NEXT: cmpw $-1, %ax @@ -3355,6 +3355,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 @@ -3363,7 +3364,6 @@ ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: cmpw $-1, %ax ; AVX1-NEXT: sete %al @@ -3389,7 +3389,7 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpand %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -88,11 +88,11 @@ ; SSE-LABEL: test_v8i64: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -145,14 +145,14 @@ ; SSE-LABEL: test_v16i64: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: setne %al @@ -161,7 +161,7 @@ ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -176,7 +176,7 @@ ; AVX2-LABEL: test_v16i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -324,13 +324,13 @@ ; SSE-LABEL: test_v16i32: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: setne %al ; SSE-NEXT: retq @@ -389,14 +389,14 @@ ; SSE-LABEL: test_v32i32: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax @@ -407,7 +407,7 @@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -424,7 +424,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -619,16 +619,16 @@ ; SSE-LABEL: test_v32i16: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: sete %al ; SSE-NEXT: retq @@ -693,14 +693,14 @@ ; SSE-LABEL: test_v64i16: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -714,7 +714,7 @@ ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -733,7 +733,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm2, 
%ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -980,19 +980,19 @@ ; SSE-LABEL: test_v64i8: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: setne %al ; SSE-NEXT: retq @@ -1063,14 +1063,14 @@ ; SSE-LABEL: test_v128i8: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -1087,7 +1087,7 @@ ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -1108,7 +1108,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -74,11 +74,11 @@ ; SSE-LABEL: test_v8i64: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i64: @@ -122,21 +122,21 @@ ; SSE-LABEL: test_v16i64: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] 
+; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -149,7 +149,7 @@ ; AVX2-LABEL: test_v16i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -273,13 +273,13 @@ ; SSE-LABEL: test_v16i32: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i32: @@ -329,14 +329,14 @@ ; SSE-LABEL: test_v32i32: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax @@ -345,7 +345,7 @@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -360,7 +360,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -536,16 +536,16 @@ ; SSE-LABEL: test_v32i16: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; @@ -605,14 +605,14 @@ ; SSE-LABEL: test_v64i16: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 
-; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -625,7 +625,7 @@ ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -643,7 +643,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -870,19 +870,19 @@ ; SSE-LABEL: test_v64i8: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; @@ -948,14 +948,14 @@ ; SSE-LABEL: test_v128i8: ; SSE: # %bb.0: ; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -971,7 +971,7 @@ ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -991,7 +991,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -917,13 +917,13 @@ ; SSE41-LABEL: test_v16i32: 
; SSE41: # %bb.0: ; SSE41-NEXT: pmulld %xmm3, %xmm1 -; SSE41-NEXT: pmulld %xmm2, %xmm1 -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE41-NEXT: pmulld %xmm2, %xmm0 ; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE41-NEXT: pmulld %xmm1, %xmm0 +; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_v16i32: @@ -931,8 +931,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1007,14 +1007,14 @@ ; SSE41-LABEL: test_v32i32: ; SSE41: # %bb.0: ; SSE41-NEXT: pmulld %xmm6, %xmm2 -; SSE41-NEXT: pmulld %xmm7, %xmm3 -; SSE41-NEXT: pmulld %xmm5, %xmm3 -; SSE41-NEXT: pmulld %xmm1, %xmm3 -; SSE41-NEXT: pmulld %xmm4, %xmm2 -; SSE41-NEXT: pmulld %xmm3, %xmm2 -; SSE41-NEXT: pmulld %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE41-NEXT: pmulld %xmm4, %xmm0 ; SSE41-NEXT: pmulld %xmm2, %xmm0 +; SSE41-NEXT: pmulld %xmm7, %xmm3 +; SSE41-NEXT: pmulld %xmm5, %xmm1 +; SSE41-NEXT: pmulld %xmm3, %xmm1 +; SSE41-NEXT: pmulld %xmm0, %xmm1 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE41-NEXT: pmulld %xmm1, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmulld %xmm0, %xmm1 ; SSE41-NEXT: movd %xmm1, %eax @@ -1023,16 +1023,16 @@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpmulld %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1044,7 +1044,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 @@ -1220,16 +1220,16 @@ ; SSE-LABEL: test_v32i16: ; SSE: # %bb.0: ; SSE-NEXT: pmullw %xmm3, %xmm1 -; SSE-NEXT: pmullw %xmm2, %xmm1 -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pmullw %xmm2, %xmm0 ; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; 
SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; @@ -1238,8 +1238,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1342,14 +1342,14 @@ ; SSE-LABEL: test_v64i16: ; SSE: # %bb.0: ; SSE-NEXT: pmullw %xmm6, %xmm2 -; SSE-NEXT: pmullw %xmm7, %xmm3 -; SSE-NEXT: pmullw %xmm5, %xmm3 -; SSE-NEXT: pmullw %xmm1, %xmm3 -; SSE-NEXT: pmullw %xmm4, %xmm2 -; SSE-NEXT: pmullw %xmm3, %xmm2 -; SSE-NEXT: pmullw %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pmullw %xmm4, %xmm0 ; SSE-NEXT: pmullw %xmm2, %xmm0 +; SSE-NEXT: pmullw %xmm7, %xmm3 +; SSE-NEXT: pmullw %xmm5, %xmm1 +; SSE-NEXT: pmullw %xmm3, %xmm1 +; SSE-NEXT: pmullw %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pmullw %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pmullw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -1362,16 +1362,16 @@ ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1386,7 +1386,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1442,8 +1442,8 @@ ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1462,8 +1462,8 @@ ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[2,3,2,3] @@ -1650,9 +1650,9 @@ ; SSE2-NEXT: pmullw %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm1, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1671,8 +1671,8 @@ ; SSE41-NEXT: pmullw %xmm2, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm1, %xmm0 ; SSE41-NEXT: pmullw %xmm2, %xmm0 +; SSE41-NEXT: pmullw %xmm1, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1691,8 +1691,8 @@ ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1753,19 +1753,19 @@ ; SSE2-NEXT: pmullw %xmm4, %xmm3 ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm3, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm4, %xmm1 +; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm2, %xmm3 -; SSE2-NEXT: pmullw %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm0 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1784,16 +1784,16 @@ ; SSE41-NEXT: pmullw %xmm4, %xmm3 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: pmullw %xmm4, %xmm1 +; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm3, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm3, %xmm0 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pmullw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -1813,17 +1813,17 @@ ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1841,8 +1841,8 @@ ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1904,8 +1904,8 @@ ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1927,8 +1927,8 @@ ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1 ; 
AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -1955,39 +1955,39 @@ ; SSE2-NEXT: pmullw %xmm8, %xmm7 ; SSE2-NEXT: movdqa %xmm3, %xmm8 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm7, %xmm8 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm8, %xmm3 +; SSE2-NEXT: pmullw %xmm7, %xmm3 ; SSE2-NEXT: movdqa %xmm5, %xmm7 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm7, %xmm5 ; SSE2-NEXT: movdqa %xmm1, %xmm7 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm5, %xmm7 -; SSE2-NEXT: pmullw %xmm3, %xmm7 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm7, %xmm1 +; SSE2-NEXT: pmullw %xmm5, %xmm1 +; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: movdqa %xmm6, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm6 ; SSE2-NEXT: movdqa %xmm2, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm6, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm2 +; SSE2-NEXT: pmullw %xmm6, %xmm2 ; SSE2-NEXT: movdqa %xmm4, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm4 ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm4, %xmm3 -; SSE2-NEXT: pmullw %xmm2, %xmm3 -; SSE2-NEXT: pmullw %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pmullw %xmm3, %xmm0 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: pmullw %xmm2, %xmm0 +; SSE2-NEXT: pmullw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmullw %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -2006,32 +2006,32 @@ ; SSE41-NEXT: pmullw %xmm8, %xmm7 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm7, %xmm3 ; SSE41-NEXT: pmullw %xmm8, %xmm3 +; SSE41-NEXT: pmullw %xmm7, %xmm3 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm7, %xmm5 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm7, %xmm1 ; SSE41-NEXT: 
pmullw %xmm5, %xmm1 ; SSE41-NEXT: pmullw %xmm3, %xmm1 -; SSE41-NEXT: pmullw %xmm7, %xmm1 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm3, %xmm6 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm6, %xmm2 ; SSE41-NEXT: pmullw %xmm3, %xmm2 +; SSE41-NEXT: pmullw %xmm6, %xmm2 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm3, %xmm4 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmullw %xmm3, %xmm0 ; SSE41-NEXT: pmullw %xmm4, %xmm0 ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pmullw %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -2051,8 +2051,8 @@ ; AVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm4 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] @@ -2060,26 +2060,26 @@ ; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero +; AVX1-NEXT: vpmullw %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; AVX1-NEXT: vpmullw %xmm5, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -2097,16 +2097,16 @@ ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 @@ -2127,8 +2127,8 @@ ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 @@ -2151,8 +2151,8 @@ ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] ; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm1, %zmm1 ; 
AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm2, %zmm1 ; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] +; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 @@ -2177,17 +2177,17 @@ ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm4, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm3, %ymm1 -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQ-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2209,17 +2209,17 @@ ; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm4, %ymm2 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm3, %ymm1 -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 +; AVX512DQVL-NEXT: vpmullw 
%ymm2, %ymm0, %ymm0 ; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -741,10 +741,10 @@ ; SSE-LABEL: trunc_v64i8_v64i1: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: testl %eax, %eax ; SSE-NEXT: setne %al ; SSE-NEXT: retq @@ -777,8 +777,8 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1582,10 +1582,10 @@ ; SSE-NEXT: pxor %xmm4, %xmm4 ; SSE-NEXT: pcmpeqb %xmm4, %xmm2 ; SSE-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pcmpeqb %xmm4, %xmm3 ; SSE-NEXT: pcmpeqb %xmm4, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pmovmskb %xmm1, %eax ; SSE-NEXT: testl %eax, %eax @@ -1597,13 +1597,13 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al @@ -1631,8 +1631,8 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2418,10 +2418,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: pcmpeqb %xmm6, %xmm2 ; SSE-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pcmpeqb %xmm7, %xmm3 ; SSE-NEXT: pcmpeqb %xmm5, %xmm1 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pmovmskb %xmm1, %eax ; SSE-NEXT: testl %eax, %eax @@ -2432,6 +2432,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 @@ -2440,7 +2441,6 @@ ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax ; AVX1-NEXT: setne %al @@ -2467,7 +2467,7 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; 
AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -69,11 +69,11 @@ ; SSE2-LABEL: test_v8i64: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -81,9 +81,9 @@ ; SSE41-LABEL: test_v8i64: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -120,15 +120,15 @@ ; SSE2-LABEL: test_v16i64: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq @@ -136,20 +136,20 @@ ; SSE41-LABEL: test_v16i64: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: ptest %xmm2, %xmm2 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vptest %ymm0, %ymm0 ; AVX1-NEXT: setne %al @@ -159,7 +159,7 @@ ; AVX2-LABEL: test_v16i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vptest %ymm0, %ymm0 ; AVX2-NEXT: setne %al @@ -262,11 +262,11 @@ ; SSE2-LABEL: test_v16i32: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: 
cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq @@ -274,9 +274,9 @@ ; SSE41-LABEL: test_v16i32: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -313,15 +313,15 @@ ; SSE2-LABEL: test_v32i32: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -329,20 +329,20 @@ ; SSE41-LABEL: test_v32i32: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: ptest %xmm2, %xmm2 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vptest %ymm0, %ymm0 ; AVX1-NEXT: sete %al @@ -352,7 +352,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vptest %ymm0, %ymm0 ; AVX2-NEXT: sete %al @@ -474,11 +474,11 @@ ; SSE2-LABEL: test_v32i16: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -486,9 +486,9 @@ ; SSE41-LABEL: test_v32i16: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; @@ -525,15 +525,15 @@ ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm4, %xmm0 
+; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq @@ -541,20 +541,20 @@ ; SSE41-LABEL: test_v64i16: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: ptest %xmm2, %xmm2 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vptest %ymm0, %ymm0 ; AVX1-NEXT: setne %al @@ -564,7 +564,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vptest %ymm0, %ymm0 ; AVX2-NEXT: setne %al @@ -705,11 +705,11 @@ ; SSE2-LABEL: test_v64i8: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq @@ -717,9 +717,9 @@ ; SSE41-LABEL: test_v64i8: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: ptest %xmm1, %xmm1 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; @@ -756,15 +756,15 @@ ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE2-NEXT: pmovmskb %xmm0, %eax +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %eax ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq @@ -772,20 +772,20 @@ ; SSE41-LABEL: test_v128i8: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: ptest %xmm2, %xmm2 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest %xmm0, %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, 
%ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vptest %ymm0, %ymm0 ; AVX1-NEXT: sete %al @@ -795,7 +795,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vptest %ymm0, %ymm0 ; AVX2-NEXT: sete %al @@ -964,14 +964,14 @@ ; SSE2-LABEL: mask_v128i8: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: psllw $7, %xmm2 -; SSE2-NEXT: pmovmskb %xmm2, %eax +; SSE2-NEXT: por %xmm4, %xmm0 +; SSE2-NEXT: por %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pmovmskb %xmm0, %eax ; SSE2-NEXT: xorl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF ; SSE2-NEXT: sete %al @@ -980,20 +980,20 @@ ; SSE41-LABEL: mask_v128i8: ; SSE41: # %bb.0: ; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 +; SSE41-NEXT: por %xmm5, %xmm1 +; SSE41-NEXT: por %xmm3, %xmm1 ; SSE41-NEXT: por %xmm6, %xmm2 -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE41-NEXT: por %xmm4, %xmm0 +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: por %xmm1, %xmm0 +; SSE41-NEXT: ptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; ; AVX1-LABEL: mask_v128i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX1-NEXT: sete %al @@ -1003,7 +1003,7 @@ ; AVX2-LABEL: mask_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] ; AVX2-NEXT: vptest %ymm1, %ymm0 @@ -1091,11 +1091,11 @@ ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: orl %eax, %ecx ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE2-NEXT: movd %xmm0, %edx -; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: orl %eax, %edx -; SSE2-NEXT: testb $1, %dl +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: orl %ecx, %eax +; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB27_2 ; SSE2-NEXT: # %bb.1: ; SSE2-NEXT: xorl %eax, %eax @@ -1111,10 +1111,10 @@ ; SSE41-NEXT: pxor %xmm0, %xmm1 ; SSE41-NEXT: pextrd $1, %xmm1, %eax ; SSE41-NEXT: movd %xmm1, %ecx -; SSE41-NEXT: pextrd $2, %xmm1, %edx -; SSE41-NEXT: orl %eax, %edx -; SSE41-NEXT: orl %ecx, %edx -; SSE41-NEXT: testb $1, %dl +; SSE41-NEXT: orl %eax, %ecx +; SSE41-NEXT: pextrd $2, %xmm1, %eax +; SSE41-NEXT: orl %ecx, %eax +; SSE41-NEXT: testb $1, %al ; SSE41-NEXT: je .LBB27_2 ; SSE41-NEXT: # %bb.1: ; SSE41-NEXT: xorl %eax, %eax @@ -1130,10 +1130,10 @@ ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrd $1, %xmm0, %eax ; AVX1-NEXT: vmovd %xmm0, %ecx -; AVX1-NEXT: vpextrd $2, %xmm0, %edx -; AVX1-NEXT: orl %eax, %edx -; 
AVX1-NEXT: orl %ecx, %edx -; AVX1-NEXT: testb $1, %dl +; AVX1-NEXT: orl %eax, %ecx +; AVX1-NEXT: vpextrd $2, %xmm0, %eax +; AVX1-NEXT: orl %ecx, %eax +; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: je .LBB27_2 ; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: xorl %eax, %eax @@ -1149,10 +1149,10 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrd $1, %xmm0, %eax ; AVX2-NEXT: vmovd %xmm0, %ecx -; AVX2-NEXT: vpextrd $2, %xmm0, %edx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: orl %ecx, %edx -; AVX2-NEXT: testb $1, %dl +; AVX2-NEXT: orl %eax, %ecx +; AVX2-NEXT: vpextrd $2, %xmm0, %eax +; AVX2-NEXT: orl %ecx, %eax +; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: je .LBB27_2 ; AVX2-NEXT: # %bb.1: ; AVX2-NEXT: xorl %eax, %eax diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -74,11 +74,11 @@ ; SSE-LABEL: test_v8i64: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i64: @@ -122,21 +122,21 @@ ; SSE-LABEL: test_v16i64: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -149,7 +149,7 @@ ; AVX2-LABEL: test_v16i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -273,13 +273,13 @@ ; SSE-LABEL: test_v16i32: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i32: @@ -329,14 +329,14 @@ ; SSE-LABEL: test_v32i32: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm7, %xmm3 +; 
SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax @@ -345,7 +345,7 @@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -360,7 +360,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -536,16 +536,16 @@ ; SSE-LABEL: test_v32i16: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; @@ -605,14 +605,14 @@ ; SSE-LABEL: test_v64i16: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -625,7 +625,7 @@ ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -643,7 +643,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -870,19 +870,19 @@ ; SSE-LABEL: test_v64i8: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; 
SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; @@ -948,14 +948,14 @@ ; SSE-LABEL: test_v128i8: ; SSE: # %bb.0: ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -971,7 +971,7 @@ ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 @@ -991,7 +991,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -931,13 +931,13 @@ ; SSE4-LABEL: test_v16i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxsd %xmm3, %xmm1 -; SSE4-NEXT: pmaxsd %xmm2, %xmm1 -; SSE4-NEXT: pmaxsd %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE4-NEXT: pmaxsd %xmm2, %xmm0 ; SSE4-NEXT: pmaxsd %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pmaxsd %xmm0, %xmm1 -; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: pmaxsd %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v16i32: @@ -945,8 +945,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1041,14 +1041,14 @@ ; SSE4-LABEL: test_v32i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxsd %xmm6, %xmm2 -; SSE4-NEXT: pmaxsd %xmm7, %xmm3 -; SSE4-NEXT: pmaxsd %xmm5, %xmm3 -; SSE4-NEXT: pmaxsd %xmm1, %xmm3 -; SSE4-NEXT: pmaxsd %xmm4, %xmm2 -; SSE4-NEXT: pmaxsd %xmm3, %xmm2 -; SSE4-NEXT: pmaxsd %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE4-NEXT: pmaxsd %xmm4, %xmm0 ; SSE4-NEXT: pmaxsd %xmm2, %xmm0 +; SSE4-NEXT: pmaxsd %xmm7, %xmm3 +; SSE4-NEXT: pmaxsd %xmm5, %xmm1 +; SSE4-NEXT: pmaxsd %xmm3, %xmm1 +; SSE4-NEXT: pmaxsd %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE4-NEXT: pmaxsd %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pmaxsd %xmm0, %xmm1 ; SSE4-NEXT: movd %xmm1, %eax @@ -1057,16 +1057,16 
@@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpmaxsd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsd %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1078,7 +1078,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxsd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 @@ -1288,26 +1288,26 @@ ; SSE2-LABEL: test_v32i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; SSE2-NEXT: pmaxsw %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pmaxsw %xmm2, %xmm0 ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v32i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxsw %xmm3, %xmm1 -; SSE4-NEXT: pmaxsw %xmm2, %xmm1 -; SSE4-NEXT: pmaxsw %xmm0, %xmm1 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pmaxsw %xmm2, %xmm0 +; SSE4-NEXT: pmaxsw %xmm1, %xmm0 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorl $32767, %eax # imm = 0x7FFF ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1318,8 +1318,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -1362,14 +1362,14 @@ ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pmaxsw %xmm6, %xmm2 -; SSE2-NEXT: pmaxsw %xmm7, %xmm3 -; SSE2-NEXT: pmaxsw %xmm5, %xmm3 -; SSE2-NEXT: pmaxsw %xmm1, %xmm3 -; SSE2-NEXT: pmaxsw %xmm4, %xmm2 -; SSE2-NEXT: pmaxsw %xmm3, %xmm2 -; SSE2-NEXT: pmaxsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: pmaxsw %xmm4, %xmm0 ; SSE2-NEXT: pmaxsw %xmm2, %xmm0 +; SSE2-NEXT: pmaxsw %xmm7, %xmm3 +; SSE2-NEXT: pmaxsw %xmm5, %xmm1 +; SSE2-NEXT: pmaxsw %xmm3, %xmm1 +; SSE2-NEXT: pmaxsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pmaxsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: pmaxsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 @@ -1382,14 +1382,14 @@ ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxsw %xmm7, %xmm3 -; SSE4-NEXT: pmaxsw %xmm5, %xmm3 -; SSE4-NEXT: pmaxsw %xmm1, %xmm3 +; SSE4-NEXT: pmaxsw %xmm5, %xmm1 +; SSE4-NEXT: pmaxsw %xmm3, %xmm1 ; SSE4-NEXT: pmaxsw %xmm6, %xmm2 -; SSE4-NEXT: pmaxsw %xmm4, %xmm2 -; SSE4-NEXT: pmaxsw %xmm3, %xmm2 -; SSE4-NEXT: pmaxsw %xmm0, %xmm2 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE4-NEXT: phminposuw %xmm2, %xmm0 +; SSE4-NEXT: pmaxsw %xmm4, %xmm0 +; SSE4-NEXT: pmaxsw %xmm2, %xmm0 +; SSE4-NEXT: pmaxsw %xmm1, %xmm0 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorl $32767, %eax # imm = 0x7FFF ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1401,13 +1401,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpmaxsw %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -1419,7 +1419,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 @@ -1836,13 +1836,13 @@ ; SSE4-LABEL: test_v64i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxsb %xmm3, %xmm1 -; SSE4-NEXT: pmaxsb %xmm2, %xmm1 -; SSE4-NEXT: pmaxsb %xmm0, %xmm1 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: psrlw $8, %xmm0 -; SSE4-NEXT: pminub %xmm1, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pmaxsb %xmm2, %xmm0 +; SSE4-NEXT: pmaxsb %xmm1, %xmm0 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorb $127, %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -1853,8 +1853,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -1970,17 +1970,17 @@ ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxsb %xmm7, %xmm3 -; SSE4-NEXT: pmaxsb %xmm5, %xmm3 -; SSE4-NEXT: pmaxsb %xmm1, %xmm3 +; SSE4-NEXT: pmaxsb %xmm5, %xmm1 +; SSE4-NEXT: pmaxsb %xmm3, %xmm1 ; SSE4-NEXT: pmaxsb %xmm6, %xmm2 -; SSE4-NEXT: pmaxsb %xmm4, %xmm2 -; SSE4-NEXT: pmaxsb %xmm3, %xmm2 -; SSE4-NEXT: pmaxsb %xmm0, %xmm2 
-; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: psrlw $8, %xmm0 -; SSE4-NEXT: pminub %xmm2, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pmaxsb %xmm4, %xmm0 +; SSE4-NEXT: pmaxsb %xmm2, %xmm0 +; SSE4-NEXT: pmaxsb %xmm1, %xmm0 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorb $127, %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -1992,13 +1992,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpmaxsb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxsb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -2012,7 +2012,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxsb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -931,13 +931,13 @@ ; SSE4-LABEL: test_v16i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pminsd %xmm3, %xmm1 -; SSE4-NEXT: pminsd %xmm2, %xmm1 -; SSE4-NEXT: pminsd %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE4-NEXT: pminsd %xmm2, %xmm0 ; SSE4-NEXT: pminsd %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pminsd %xmm0, %xmm1 -; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: pminsd %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v16i32: @@ -945,8 +945,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1041,14 +1041,14 @@ ; SSE4-LABEL: test_v32i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pminsd %xmm6, %xmm2 -; SSE4-NEXT: pminsd %xmm7, %xmm3 -; SSE4-NEXT: pminsd %xmm5, %xmm3 -; SSE4-NEXT: pminsd %xmm1, %xmm3 -; SSE4-NEXT: pminsd %xmm4, %xmm2 -; SSE4-NEXT: pminsd %xmm3, %xmm2 -; SSE4-NEXT: pminsd %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE4-NEXT: pminsd %xmm4, %xmm0 ; SSE4-NEXT: pminsd %xmm2, %xmm0 +; SSE4-NEXT: pminsd %xmm7, %xmm3 +; SSE4-NEXT: pminsd %xmm5, %xmm1 +; SSE4-NEXT: pminsd %xmm3, %xmm1 +; SSE4-NEXT: pminsd %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE4-NEXT: pminsd %xmm1, %xmm0 ; 
SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pminsd %xmm0, %xmm1 ; SSE4-NEXT: movd %xmm1, %eax @@ -1057,16 +1057,16 @@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpminsd %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpminsd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpminsd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminsd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminsd %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1078,7 +1078,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsd %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminsd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 @@ -1288,26 +1288,26 @@ ; SSE2-LABEL: test_v32i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pminsw %xmm3, %xmm1 -; SSE2-NEXT: pminsw %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pminsw %xmm2, %xmm0 ; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v32i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pminsw %xmm3, %xmm1 -; SSE4-NEXT: pminsw %xmm2, %xmm1 -; SSE4-NEXT: pminsw %xmm0, %xmm1 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pminsw %xmm2, %xmm0 +; SSE4-NEXT: pminsw %xmm1, %xmm0 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1318,8 +1318,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -1362,14 +1362,14 @@ ; SSE2-LABEL: test_v64i16: ; SSE2: # %bb.0: ; SSE2-NEXT: pminsw %xmm6, %xmm2 -; SSE2-NEXT: pminsw %xmm7, %xmm3 -; SSE2-NEXT: pminsw %xmm5, %xmm3 -; SSE2-NEXT: pminsw %xmm1, %xmm3 -; SSE2-NEXT: pminsw %xmm4, %xmm2 -; SSE2-NEXT: pminsw %xmm3, %xmm2 -; SSE2-NEXT: pminsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: pminsw %xmm4, %xmm0 ; SSE2-NEXT: pminsw %xmm2, %xmm0 +; SSE2-NEXT: pminsw %xmm7, 
%xmm3 +; SSE2-NEXT: pminsw %xmm5, %xmm1 +; SSE2-NEXT: pminsw %xmm3, %xmm1 +; SSE2-NEXT: pminsw %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pminsw %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: pminsw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 @@ -1382,14 +1382,14 @@ ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pminsw %xmm7, %xmm3 -; SSE4-NEXT: pminsw %xmm5, %xmm3 -; SSE4-NEXT: pminsw %xmm1, %xmm3 +; SSE4-NEXT: pminsw %xmm5, %xmm1 +; SSE4-NEXT: pminsw %xmm3, %xmm1 ; SSE4-NEXT: pminsw %xmm6, %xmm2 -; SSE4-NEXT: pminsw %xmm4, %xmm2 -; SSE4-NEXT: pminsw %xmm3, %xmm2 -; SSE4-NEXT: pminsw %xmm0, %xmm2 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE4-NEXT: phminposuw %xmm2, %xmm0 +; SSE4-NEXT: pminsw %xmm4, %xmm0 +; SSE4-NEXT: pminsw %xmm2, %xmm0 +; SSE4-NEXT: pminsw %xmm1, %xmm0 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: xorl $32768, %eax # imm = 0x8000 ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1401,13 +1401,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpminsw %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminsw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminsw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpminsw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminsw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax @@ -1419,7 +1419,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminsw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 @@ -1836,13 +1836,13 @@ ; SSE4-LABEL: test_v64i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pminsb %xmm3, %xmm1 -; SSE4-NEXT: pminsb %xmm2, %xmm1 -; SSE4-NEXT: pminsb %xmm0, %xmm1 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: psrlw $8, %xmm0 -; SSE4-NEXT: pminub %xmm1, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pminsb %xmm2, %xmm0 +; SSE4-NEXT: pminsb %xmm1, %xmm0 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: addb $-128, %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -1853,8 +1853,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -1970,17 +1970,17 @@ ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pminsb %xmm7, %xmm3 -; SSE4-NEXT: pminsb %xmm5, %xmm3 -; SSE4-NEXT: pminsb %xmm1, %xmm3 +; SSE4-NEXT: pminsb %xmm5, %xmm1 +; SSE4-NEXT: pminsb %xmm3, %xmm1 ; 
SSE4-NEXT: pminsb %xmm6, %xmm2 -; SSE4-NEXT: pminsb %xmm4, %xmm2 -; SSE4-NEXT: pminsb %xmm3, %xmm2 -; SSE4-NEXT: pminsb %xmm0, %xmm2 -; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: psrlw $8, %xmm0 -; SSE4-NEXT: pminub %xmm2, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pminsb %xmm4, %xmm0 +; SSE4-NEXT: pminsb %xmm2, %xmm0 +; SSE4-NEXT: pminsb %xmm1, %xmm0 +; SSE4-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: addb $-128, %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -1992,13 +1992,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpminsb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminsb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpminsb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminsb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 @@ -2012,7 +2012,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminsb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -1065,13 +1065,13 @@ ; SSE4-LABEL: test_v16i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxud %xmm3, %xmm1 -; SSE4-NEXT: pmaxud %xmm2, %xmm1 -; SSE4-NEXT: pmaxud %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE4-NEXT: pmaxud %xmm2, %xmm0 ; SSE4-NEXT: pmaxud %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 -; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: pmaxud %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v16i32: @@ -1079,8 +1079,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1202,14 +1202,14 @@ ; SSE4-LABEL: test_v32i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxud %xmm6, %xmm2 -; SSE4-NEXT: pmaxud %xmm7, %xmm3 -; SSE4-NEXT: pmaxud %xmm5, %xmm3 -; SSE4-NEXT: pmaxud %xmm1, %xmm3 -; SSE4-NEXT: pmaxud %xmm4, %xmm2 -; SSE4-NEXT: pmaxud %xmm3, %xmm2 -; SSE4-NEXT: pmaxud %xmm0, %xmm2 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE4-NEXT: pmaxud %xmm4, %xmm0 ; SSE4-NEXT: pmaxud %xmm2, %xmm0 +; SSE4-NEXT: pmaxud %xmm7, %xmm3 +; SSE4-NEXT: pmaxud %xmm5, %xmm1 +; SSE4-NEXT: pmaxud 
%xmm3, %xmm1 +; SSE4-NEXT: pmaxud %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE4-NEXT: pmaxud %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pmaxud %xmm0, %xmm1 ; SSE4-NEXT: movd %xmm1, %eax @@ -1218,16 +1218,16 @@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpmaxud %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmaxud %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpmaxud %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxud %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1239,7 +1239,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 @@ -1529,11 +1529,11 @@ ; SSE4-LABEL: test_v32i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxuw %xmm3, %xmm1 -; SSE4-NEXT: pmaxuw %xmm2, %xmm1 -; SSE4-NEXT: pmaxuw %xmm0, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pmaxuw %xmm2, %xmm0 +; SSE4-NEXT: pmaxuw %xmm1, %xmm0 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: notl %eax ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1544,8 +1544,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -1634,15 +1634,15 @@ ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxuw %xmm7, %xmm3 -; SSE4-NEXT: pmaxuw %xmm5, %xmm3 -; SSE4-NEXT: pmaxuw %xmm1, %xmm3 +; SSE4-NEXT: pmaxuw %xmm5, %xmm1 +; SSE4-NEXT: pmaxuw %xmm3, %xmm1 ; SSE4-NEXT: pmaxuw %xmm6, %xmm2 -; SSE4-NEXT: pmaxuw %xmm4, %xmm2 -; SSE4-NEXT: pmaxuw %xmm3, %xmm2 -; SSE4-NEXT: pmaxuw %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pmaxuw %xmm4, %xmm0 +; SSE4-NEXT: pmaxuw %xmm2, %xmm0 +; SSE4-NEXT: pmaxuw %xmm1, %xmm0 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: notl %eax ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax @@ -1654,13 +1654,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpmaxuw %xmm5, %xmm6, %xmm5 ; 
AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxuw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -1673,7 +1673,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxuw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 @@ -1996,33 +1996,33 @@ ; SSE2-LABEL: test_v64i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pmaxub %xmm3, %xmm1 -; SSE2-NEXT: pmaxub %xmm2, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pmaxub %xmm2, %xmm0 ; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: pmaxub %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pmaxub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v64i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxub %xmm3, %xmm1 -; SSE4-NEXT: pmaxub %xmm2, %xmm1 -; SSE4-NEXT: pmaxub %xmm0, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm1, %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrlw $8, %xmm1 -; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pmaxub %xmm2, %xmm0 +; SSE4-NEXT: pmaxub %xmm1, %xmm0 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: notb %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -2033,8 +2033,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -2101,14 +2101,14 @@ ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pmaxub %xmm6, %xmm2 -; SSE2-NEXT: pmaxub %xmm7, %xmm3 -; SSE2-NEXT: pmaxub %xmm5, %xmm3 -; SSE2-NEXT: pmaxub %xmm1, %xmm3 -; SSE2-NEXT: pmaxub %xmm4, %xmm2 -; SSE2-NEXT: pmaxub %xmm3, %xmm2 -; SSE2-NEXT: pmaxub %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: pmaxub %xmm4, %xmm0 ; SSE2-NEXT: pmaxub %xmm2, %xmm0 +; SSE2-NEXT: pmaxub %xmm7, %xmm3 +; SSE2-NEXT: pmaxub %xmm5, %xmm1 +; SSE2-NEXT: pmaxub %xmm3, %xmm1 +; SSE2-NEXT: pmaxub %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pmaxub %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: pmaxub %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 @@ -2124,18 +2124,18 @@ ; 
SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pmaxub %xmm7, %xmm3 -; SSE4-NEXT: pmaxub %xmm5, %xmm3 -; SSE4-NEXT: pmaxub %xmm1, %xmm3 +; SSE4-NEXT: pmaxub %xmm5, %xmm1 +; SSE4-NEXT: pmaxub %xmm3, %xmm1 ; SSE4-NEXT: pmaxub %xmm6, %xmm2 -; SSE4-NEXT: pmaxub %xmm4, %xmm2 -; SSE4-NEXT: pmaxub %xmm3, %xmm2 -; SSE4-NEXT: pmaxub %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm2, %xmm0 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: psrlw $8, %xmm1 -; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pmaxub %xmm4, %xmm0 +; SSE4-NEXT: pmaxub %xmm2, %xmm0 +; SSE4-NEXT: pmaxub %xmm1, %xmm0 +; SSE4-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: notb %al ; SSE4-NEXT: # kill: def $al killed $al killed $eax @@ -2147,13 +2147,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpmaxub %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxub %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmaxub %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 @@ -2168,7 +2168,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpmaxub %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -1069,13 +1069,13 @@ ; SSE4-LABEL: test_v16i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pminud %xmm3, %xmm1 -; SSE4-NEXT: pminud %xmm2, %xmm1 -; SSE4-NEXT: pminud %xmm0, %xmm1 -; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE4-NEXT: pminud %xmm2, %xmm0 ; SSE4-NEXT: pminud %xmm1, %xmm0 -; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE4-NEXT: pminud %xmm0, %xmm1 -; SSE4-NEXT: movd %xmm1, %eax +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE4-NEXT: pminud %xmm1, %xmm0 +; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: retq ; ; AVX1-LABEL: test_v16i32: @@ -1083,8 +1083,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1206,14 +1206,14 @@ ; SSE4-LABEL: test_v32i32: ; SSE4: # %bb.0: ; SSE4-NEXT: pminud %xmm6, %xmm2 -; SSE4-NEXT: pminud %xmm7, %xmm3 -; SSE4-NEXT: pminud %xmm5, %xmm3 -; SSE4-NEXT: pminud %xmm1, %xmm3 -; SSE4-NEXT: pminud %xmm4, %xmm2 -; SSE4-NEXT: pminud %xmm3, %xmm2 -; SSE4-NEXT: pminud %xmm0, %xmm2 -; 
SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE4-NEXT: pminud %xmm4, %xmm0 ; SSE4-NEXT: pminud %xmm2, %xmm0 +; SSE4-NEXT: pminud %xmm7, %xmm3 +; SSE4-NEXT: pminud %xmm5, %xmm1 +; SSE4-NEXT: pminud %xmm3, %xmm1 +; SSE4-NEXT: pminud %xmm0, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE4-NEXT: pminud %xmm1, %xmm0 ; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE4-NEXT: pminud %xmm0, %xmm1 ; SSE4-NEXT: movd %xmm1, %eax @@ -1222,16 +1222,16 @@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpminud %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminud %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] @@ -1243,7 +1243,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 @@ -1507,9 +1507,9 @@ ; SSE4-LABEL: test_v32i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pminuw %xmm3, %xmm1 -; SSE4-NEXT: pminuw %xmm2, %xmm1 -; SSE4-NEXT: pminuw %xmm0, %xmm1 -; SSE4-NEXT: phminposuw %xmm1, %xmm0 +; SSE4-NEXT: pminuw %xmm2, %xmm0 +; SSE4-NEXT: pminuw %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax ; SSE4-NEXT: retq @@ -1519,8 +1519,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax @@ -1597,13 +1597,13 @@ ; SSE4-LABEL: test_v64i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pminuw %xmm7, %xmm3 -; SSE4-NEXT: pminuw %xmm5, %xmm3 -; SSE4-NEXT: pminuw %xmm1, %xmm3 +; SSE4-NEXT: pminuw %xmm5, %xmm1 +; SSE4-NEXT: pminuw %xmm3, %xmm1 ; SSE4-NEXT: pminuw %xmm6, %xmm2 -; SSE4-NEXT: pminuw %xmm4, %xmm2 -; SSE4-NEXT: pminuw %xmm3, %xmm2 -; SSE4-NEXT: pminuw %xmm0, %xmm2 -; SSE4-NEXT: phminposuw %xmm2, %xmm0 +; SSE4-NEXT: pminuw %xmm4, %xmm0 +; SSE4-NEXT: pminuw %xmm2, %xmm0 +; SSE4-NEXT: pminuw %xmm1, %xmm0 +; SSE4-NEXT: phminposuw %xmm0, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: # kill: def $ax killed $ax killed $eax ; SSE4-NEXT: retq @@ -1614,13 +1614,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpminuw %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpminuw %xmm3, %xmm1, 
%xmm1 -; AVX1-NEXT: vpminuw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: # kill: def $ax killed $ax killed $eax @@ -1630,7 +1630,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminuw %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 @@ -1887,31 +1887,31 @@ ; SSE2-LABEL: test_v64i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pminub %xmm3, %xmm1 -; SSE2-NEXT: pminub %xmm2, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pminub %xmm2, %xmm0 ; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: pminub %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 ; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psrlw $8, %xmm0 +; SSE2-NEXT: pminub %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: # kill: def $al killed $al killed $eax ; SSE2-NEXT: retq ; ; SSE4-LABEL: test_v64i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pminub %xmm3, %xmm1 -; SSE4-NEXT: pminub %xmm2, %xmm1 -; SSE4-NEXT: pminub %xmm0, %xmm1 -; SSE4-NEXT: movdqa %xmm1, %xmm0 -; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm2, %xmm0 ; SSE4-NEXT: pminub %xmm1, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: # kill: def $al killed $al killed $eax ; SSE4-NEXT: retq @@ -1921,8 +1921,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -1965,14 +1965,14 @@ ; SSE2-LABEL: test_v128i8: ; SSE2: # %bb.0: ; SSE2-NEXT: pminub %xmm6, %xmm2 -; SSE2-NEXT: pminub %xmm7, %xmm3 -; SSE2-NEXT: pminub %xmm5, %xmm3 -; SSE2-NEXT: pminub %xmm1, %xmm3 -; SSE2-NEXT: pminub %xmm4, %xmm2 -; SSE2-NEXT: pminub %xmm3, %xmm2 -; SSE2-NEXT: pminub %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE2-NEXT: pminub %xmm4, %xmm0 ; SSE2-NEXT: pminub %xmm2, %xmm0 +; SSE2-NEXT: pminub %xmm7, %xmm3 +; SSE2-NEXT: pminub %xmm5, %xmm1 +; SSE2-NEXT: pminub %xmm3, %xmm1 +; SSE2-NEXT: pminub %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE2-NEXT: pminub %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: pminub %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 @@ -1988,16 +1988,16 @@ ; SSE4-LABEL: test_v128i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pminub %xmm7, %xmm3 -; SSE4-NEXT: pminub %xmm5, %xmm3 -; SSE4-NEXT: pminub %xmm1, %xmm3 +; SSE4-NEXT: pminub %xmm5, %xmm1 +; SSE4-NEXT: pminub %xmm3, %xmm1 ; SSE4-NEXT: pminub %xmm6, %xmm2 -; 
SSE4-NEXT: pminub %xmm4, %xmm2 -; SSE4-NEXT: pminub %xmm3, %xmm2 -; SSE4-NEXT: pminub %xmm0, %xmm2 -; SSE4-NEXT: movdqa %xmm2, %xmm0 -; SSE4-NEXT: psrlw $8, %xmm0 +; SSE4-NEXT: pminub %xmm4, %xmm0 ; SSE4-NEXT: pminub %xmm2, %xmm0 -; SSE4-NEXT: phminposuw %xmm0, %xmm0 +; SSE4-NEXT: pminub %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, %xmm1 +; SSE4-NEXT: psrlw $8, %xmm1 +; SSE4-NEXT: pminub %xmm0, %xmm1 +; SSE4-NEXT: phminposuw %xmm1, %xmm0 ; SSE4-NEXT: movd %xmm0, %eax ; SSE4-NEXT: # kill: def $al killed $al killed $eax ; SSE4-NEXT: retq @@ -2008,13 +2008,13 @@ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpminub %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminub %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminub %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vphminposuw %xmm0, %xmm0 @@ -2026,7 +2026,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminub %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpminub %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -805,10 +805,10 @@ ; SSE-LABEL: trunc_v64i8_v64i1: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: pmovmskb %xmm1, %eax +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: psllw $7, %xmm0 +; SSE-NEXT: pmovmskb %xmm0, %eax ; SSE-NEXT: xorb %ah, %al ; SSE-NEXT: setnp %al ; SSE-NEXT: retq @@ -844,8 +844,8 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpxor %xmm2, %xmm3, %xmm2 -; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -1737,10 +1737,10 @@ ; SSE-NEXT: pxor %xmm4, %xmm4 ; SSE-NEXT: pcmpeqb %xmm4, %xmm2 ; SSE-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pcmpeqb %xmm4, %xmm3 ; SSE-NEXT: pcmpeqb %xmm4, %xmm1 ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: pmovmskb %xmm1, %eax ; SSE-NEXT: xorb %ah, %al @@ -1752,13 +1752,13 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: xorb %ah, %al ; AVX1-NEXT: setnp %al @@ 
-1789,8 +1789,8 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpxor %xmm2, %xmm3, %xmm2 -; AVX512F-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -2664,10 +2664,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: pcmpeqb %xmm6, %xmm2 ; SSE-NEXT: pcmpeqb %xmm4, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pcmpeqb %xmm7, %xmm3 ; SSE-NEXT: pcmpeqb %xmm5, %xmm1 ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: pmovmskb %xmm1, %eax ; SSE-NEXT: xorb %ah, %al @@ -2678,6 +2678,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1 @@ -2686,7 +2687,6 @@ ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpmovmskb %xmm0, %eax ; AVX1-NEXT: xorb %ah, %al ; AVX1-NEXT: setnp %al @@ -2716,7 +2716,7 @@ ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm1 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX512F-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vpxor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -74,11 +74,11 @@ ; SSE-LABEL: test_v8i64: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v8i64: @@ -122,21 +122,21 @@ ; SSE-LABEL: test_v16i64: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -149,7 +149,7 @@ ; AVX2-LABEL: test_v16i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, 
%xmm0, %xmm0 @@ -273,13 +273,13 @@ ; SSE-LABEL: test_v16i32: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i32: @@ -329,14 +329,14 @@ ; SSE-LABEL: test_v32i32: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movd %xmm1, %eax @@ -345,7 +345,7 @@ ; AVX1-LABEL: test_v32i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -360,7 +360,7 @@ ; AVX2-LABEL: test_v32i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -536,16 +536,16 @@ ; SSE-LABEL: test_v32i16: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: movd %xmm1, %eax ; SSE-NEXT: # kill: def $ax killed $ax killed $eax ; SSE-NEXT: retq ; @@ -605,14 +605,14 @@ ; SSE-LABEL: test_v64i16: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -625,7 +625,7 @@ ; AVX1-LABEL: test_v64i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vxorps 
%ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -643,7 +643,7 @@ ; AVX2-LABEL: test_v64i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -870,19 +870,19 @@ ; SSE-LABEL: test_v64i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlw $8, %xmm1 +; SSE-NEXT: psrld $16, %xmm1 ; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: psrlw $8, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movd %xmm0, %eax ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; @@ -948,14 +948,14 @@ ; SSE-LABEL: test_v128i8: ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3] +; SSE-NEXT: pxor %xmm4, %xmm0 ; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm7, %xmm3 +; SSE-NEXT: pxor %xmm5, %xmm1 +; SSE-NEXT: pxor %xmm3, %xmm1 +; SSE-NEXT: pxor %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; SSE-NEXT: pxor %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE-NEXT: pxor %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 @@ -971,7 +971,7 @@ ; AVX1-LABEL: test_v128i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 @@ -991,7 +991,7 @@ ; AVX2-LABEL: test_v128i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -2615,12 +2615,12 @@ ; ; AVX1-LABEL: trunc_and_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 @@ -2726,22 +2726,22 @@ ; ; AVX1-LABEL: trunc_and_v16i64_v16i8: ; AVX1: # %bb.0: 
-; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7 -; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm6, %ymm8, %ymm6 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm3 -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm8, %ymm3 -; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 @@ -2751,17 +2751,17 @@ ; ; AVX2-LABEL: trunc_and_v16i64_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255] -; AVX2-NEXT: vpand %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpand %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] +; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] -; AVX2-NEXT: vpand %ymm5, %ymm8, %ymm3 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm8, %ymm3 -; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 @@ -2788,28 +2788,28 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; SSE-LABEL: trunc_and_v16i32_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: packuswb %xmm6, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm6, %xmm2 +; SSE-NEXT: pand %xmm7, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pand %xmm4, %xmm2 +; SSE-NEXT: packuswb %xmm3, %xmm2 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: 
vandps %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 @@ -2819,10 +2819,10 @@ ; ; AVX2-LABEL: trunc_and_v16i32_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -2845,12 +2845,12 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; SSE-LABEL: trunc_and_v16i16_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: trunc_and_v16i16_v16i8: diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -3020,24 +3020,24 @@ ; SSE2-LABEL: trunc_packus_v4i64_v4i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] ; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 @@ -3049,26 +3049,26 @@ ; SSE2-NEXT: pandn %xmm2, %xmm5 ; SSE2-NEXT: por %xmm5, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, 
%xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm1, %xmm4 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: packuswb %xmm3, %xmm0 ; SSE2-NEXT: packuswb %xmm0, %xmm0 @@ -3276,24 +3276,24 @@ ; SSE2-LABEL: trunc_packus_v4i64_v4i8_store: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] ; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm8 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm8 ; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] ; SSE2-NEXT: pand %xmm5, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,3,3] -; SSE2-NEXT: por %xmm9, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,3,3] +; SSE2-NEXT: por %xmm9, %xmm3 +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 @@ -3305,31 +3305,31 @@ ; SSE2-NEXT: pandn %xmm2, %xmm1 ; SSE2-NEXT: por %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm5, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] ; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; SSE2-NEXT: por %xmm0, %xmm4 ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pand %xmm4, %xmm3 -; 
SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: packuswb %xmm3, %xmm5 -; SSE2-NEXT: packuswb %xmm5, %xmm5 -; SSE2-NEXT: packuswb %xmm5, %xmm5 -; SSE2-NEXT: movd %xmm5, (%rdi) +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pand %xmm5, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: packuswb %xmm1, %xmm1 +; SSE2-NEXT: movd %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: trunc_packus_v4i64_v4i8_store: diff --git a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll --- a/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll +++ b/llvm/test/CodeGen/X86/vp2intersect_multiple_pairs.ll @@ -46,16 +46,16 @@ ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload ; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload ; X86-NEXT: kmovw %k0, %edi -; X86-NEXT: addl %eax, %ecx -; X86-NEXT: kmovw %k1, %eax -; X86-NEXT: addl %edx, %eax -; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k0 # 2-byte Reload -; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 # 2-byte Reload -; X86-NEXT: kmovw %k0, %edx -; X86-NEXT: addl %eax, %edx +; X86-NEXT: addl %edi, %eax +; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 # 2-byte Reload +; X86-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 # 2-byte Reload +; X86-NEXT: kmovw %k2, %edi ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: addl %edi, %edx -; X86-NEXT: movw %dx, (%esi) +; X86-NEXT: kmovw %k1, %ecx +; X86-NEXT: addl %edi, %ecx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: movw %ax, (%esi) ; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -104,12 +104,12 @@ ; X64-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; X64-NEXT: kmovw %k0, %edi ; X64-NEXT: kmovw %k1, %r8d -; X64-NEXT: addl %ecx, %edx -; X64-NEXT: addl %r8d, %eax -; X64-NEXT: addl %esi, %eax -; X64-NEXT: addl %edx, %eax ; X64-NEXT: addl %edi, %eax -; X64-NEXT: movw %ax, (%rbx) +; X64-NEXT: addl %ecx, %edx +; X64-NEXT: addl %eax, %edx +; X64-NEXT: addl %r8d, %edx +; X64-NEXT: addl %esi, %edx +; X64-NEXT: movw %dx, (%rbx) ; X64-NEXT: leaq -8(%rbp), %rsp ; X64-NEXT: popq %rbx ; X64-NEXT: popq %rbp diff --git a/llvm/test/CodeGen/X86/win-smallparams.ll b/llvm/test/CodeGen/X86/win-smallparams.ll --- a/llvm/test/CodeGen/X86/win-smallparams.ll +++ b/llvm/test/CodeGen/X86/win-smallparams.ll @@ -65,57 +65,51 @@ ; WIN64: # %bb.0: # %entry ; WIN64-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; WIN64-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; WIN64-NEXT: movsbl %cl, %ecx -; WIN64-NEXT: movswl %dx, %eax +; WIN64-NEXT: movsbl %cl, %eax +; WIN64-NEXT: movswl %dx, %ecx +; WIN64-NEXT: addl %eax, %ecx ; WIN64-NEXT: movzbl %r8b, %edx -; WIN64-NEXT: addl %eax, %edx ; WIN64-NEXT: movzwl %r9w, %eax ; WIN64-NEXT: addl %edx, %eax +; WIN64-NEXT: addl %ecx, %eax ; WIN64-NEXT: addl %r11d, %eax ; WIN64-NEXT: addl %r10d, %eax -; WIN64-NEXT: addl %ecx, %eax ; WIN64-NEXT: retq ; ; WIN32-MSVC-LABEL: manyargs: ; WIN32-MSVC: # %bb.0: # %entry -; WIN32-MSVC-NEXT: pushl %edi ; WIN32-MSVC-NEXT: pushl %esi ; WIN32-MSVC-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-MSVC-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; WIN32-MSVC-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; WIN32-MSVC-NEXT: movzbl {{[0-9]+}}(%esp), %esi -; WIN32-MSVC-NEXT: movswl {{[0-9]+}}(%esp), %edi -; WIN32-MSVC-NEXT: addl %esi, %edi -; WIN32-MSVC-NEXT: addl %edx, %edi -; WIN32-MSVC-NEXT: addl %ecx, %edi -; WIN32-MSVC-NEXT: addl %eax, %edi +; WIN32-MSVC-NEXT: addl %eax, %ecx +; WIN32-MSVC-NEXT: movzwl {{[0-9]+}}(%esp), 
%eax +; WIN32-MSVC-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; WIN32-MSVC-NEXT: addl %eax, %edx +; WIN32-MSVC-NEXT: movswl {{[0-9]+}}(%esp), %esi ; WIN32-MSVC-NEXT: movsbl {{[0-9]+}}(%esp), %eax -; WIN32-MSVC-NEXT: addl %edi, %eax +; WIN32-MSVC-NEXT: addl %esi, %eax +; WIN32-MSVC-NEXT: addl %edx, %eax +; WIN32-MSVC-NEXT: addl %ecx, %eax ; WIN32-MSVC-NEXT: popl %esi -; WIN32-MSVC-NEXT: popl %edi ; WIN32-MSVC-NEXT: retl ; ; WIN32-GNU-LABEL: manyargs: ; WIN32-GNU: # %bb.0: # %entry -; WIN32-GNU-NEXT: pushl %edi -; WIN32-GNU-NEXT: .cfi_def_cfa_offset 8 ; WIN32-GNU-NEXT: pushl %esi -; WIN32-GNU-NEXT: .cfi_def_cfa_offset 12 -; WIN32-GNU-NEXT: .cfi_offset %esi, -12 -; WIN32-GNU-NEXT: .cfi_offset %edi, -8 +; WIN32-GNU-NEXT: .cfi_def_cfa_offset 8 +; WIN32-GNU-NEXT: .cfi_offset %esi, -8 ; WIN32-GNU-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-GNU-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; WIN32-GNU-NEXT: movzwl {{[0-9]+}}(%esp), %edx -; WIN32-GNU-NEXT: movzbl {{[0-9]+}}(%esp), %esi -; WIN32-GNU-NEXT: movswl {{[0-9]+}}(%esp), %edi -; WIN32-GNU-NEXT: addl %esi, %edi -; WIN32-GNU-NEXT: addl %edx, %edi -; WIN32-GNU-NEXT: addl %ecx, %edi -; WIN32-GNU-NEXT: addl %eax, %edi +; WIN32-GNU-NEXT: addl %eax, %ecx +; WIN32-GNU-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; WIN32-GNU-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; WIN32-GNU-NEXT: addl %eax, %edx +; WIN32-GNU-NEXT: movswl {{[0-9]+}}(%esp), %esi ; WIN32-GNU-NEXT: movsbl {{[0-9]+}}(%esp), %eax -; WIN32-GNU-NEXT: addl %edi, %eax +; WIN32-GNU-NEXT: addl %esi, %eax +; WIN32-GNU-NEXT: addl %edx, %eax +; WIN32-GNU-NEXT: addl %ecx, %eax ; WIN32-GNU-NEXT: popl %esi -; WIN32-GNU-NEXT: popl %edi ; WIN32-GNU-NEXT: retl entry: %aa = sext i8 %a to i32 diff --git a/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll b/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll --- a/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll +++ b/llvm/test/CodeGen/X86/x86-32-vector-calling-conv.ll @@ -5,16 +5,16 @@ define <4 x i32> @test_sse(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind { ; DARWIN-LABEL: test_sse: ; DARWIN: ## %bb.0: -; DARWIN-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; DARWIN-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; DARWIN-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; DARWIN-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ; DARWIN-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; DARWIN-NEXT: retl ; ; LINUX-LABEL: test_sse: ; LINUX: # %bb.0: ; LINUX-NEXT: subl $12, %esp -; LINUX-NEXT: vpaddd {{[0-9]+}}(%esp), %xmm2, %xmm2 -; LINUX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; LINUX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; LINUX-NEXT: vpaddd {{[0-9]+}}(%esp), %xmm2, %xmm1 ; LINUX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; LINUX-NEXT: addl $12, %esp ; LINUX-NEXT: retl @@ -27,8 +27,8 @@ define <8 x i32> @test_avx(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) nounwind { ; DARWIN-LABEL: test_avx: ; DARWIN: ## %bb.0: -; DARWIN-NEXT: vpaddd %ymm3, %ymm2, %ymm2 -; DARWIN-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; DARWIN-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; DARWIN-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ; DARWIN-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; DARWIN-NEXT: retl ; @@ -38,8 +38,8 @@ ; LINUX-NEXT: movl %esp, %ebp ; LINUX-NEXT: andl $-32, %esp ; LINUX-NEXT: subl $32, %esp -; LINUX-NEXT: vpaddd 8(%ebp), %ymm2, %ymm2 -; LINUX-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; LINUX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; LINUX-NEXT: vpaddd 8(%ebp), %ymm2, %ymm1 ; LINUX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; LINUX-NEXT: movl %ebp, %esp ; LINUX-NEXT: popl %ebp @@ -53,8 +53,8 @@ define <16 x i32> @test_avx512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i32> %d) nounwind { ; 
DARWIN-LABEL: test_avx512: ; DARWIN: ## %bb.0: -; DARWIN-NEXT: vpaddd %zmm3, %zmm2, %zmm2 -; DARWIN-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; DARWIN-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; DARWIN-NEXT: vpaddd %zmm3, %zmm2, %zmm1 ; DARWIN-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; DARWIN-NEXT: retl ; @@ -64,8 +64,8 @@ ; LINUX-NEXT: movl %esp, %ebp ; LINUX-NEXT: andl $-64, %esp ; LINUX-NEXT: subl $64, %esp -; LINUX-NEXT: vpaddd 8(%ebp), %zmm2, %zmm2 -; LINUX-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; LINUX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; LINUX-NEXT: vpaddd 8(%ebp), %zmm2, %zmm1 ; LINUX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; LINUX-NEXT: movl %ebp, %esp ; LINUX-NEXT: popl %ebp diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -123,14 +123,14 @@ ; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3 -; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm0, %xmm5, %xmm0 ; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -147,7 +147,7 @@ ; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2] ; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] -; AVX2OR512-NEXT: vpaddq %ymm3, %ymm4, %ymm3 +; AVX2OR512-NEXT: vpaddq %ymm4, %ymm2, %ymm2 ; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2OR512-NEXT: vpaddq %ymm0, %ymm3, %ymm0 ; AVX2OR512-NEXT: vpaddq %ymm0, %ymm2, %ymm0 @@ -812,14 +812,14 @@ ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2 ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] +; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpaddb %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3: @@ -895,10 +895,10 @@ ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u] ; AVX-NEXT: 
vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %wide.vec = load <24 x i8>, ptr %ptr @@ -1270,27 +1270,27 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-LABEL: interleaved_load_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm8 +; AVX1-NEXT: vmovdqu (%rdi), %xmm9 ; AVX1-NEXT: vmovups 16(%rdi), %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqu 48(%rdi), %xmm10 ; AVX1-NEXT: vmovdqu 64(%rdi), %xmm3 ; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 -; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5 +; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 ; AVX1-NEXT: vmovdqu 144(%rdi), %xmm12 ; AVX1-NEXT: vmovdqu 160(%rdi), %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm6 +; AVX1-NEXT: vpshufb %xmm11, %xmm6, %xmm5 ; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm7 -; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm9 +; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm8 ; AVX1-NEXT: vpshufb %xmm11, %xmm10, %xmm11 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> ; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm15 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpor %xmm5, %xmm15, %xmm0 +; AVX1-NEXT: vpor %xmm6, %xmm15, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm12 ; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15 @@ -1298,21 +1298,21 @@ ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm9 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm15 -; AVX1-NEXT: vpor %xmm8, %xmm15, %xmm5 -; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm8 +; AVX1-NEXT: vpor %xmm9, %xmm15, %xmm6 +; AVX1-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm13, %xmm10, %xmm9 ; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm10 -; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm10 -; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm8 -; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm5 -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm8 +; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm10 +; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm9 +; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm6 +; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm9 ; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm3 -; AVX1-NEXT: vpshufb %xmm14, %xmm8, %xmm12 +; AVX1-NEXT: vpshufb %xmm14, %xmm9, %xmm12 ; AVX1-NEXT: vpor %xmm3, %xmm12, %xmm3 ; AVX1-NEXT: vmovdqu 176(%rdi), %xmm12 ; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm1 @@ -1324,59 +1324,59 @@ ; AVX1-NEXT: vpor %xmm13, %xmm14, %xmm14 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vpshufb %xmm0, %xmm15, %xmm13 -; AVX1-NEXT: vpor %xmm6, %xmm13, %xmm13 
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm13 +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm14 ; AVX1-NEXT: vpor %xmm7, %xmm14, %xmm14 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm0, %xmm8, %xmm7 -; AVX1-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm7 +; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm11[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm8, %xmm10, %xmm10 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] ; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 -; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm10 +; AVX1-NEXT: vpshufb %xmm11, %xmm9, %xmm9 +; AVX1-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 -; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm8 -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm10 +; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX1-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm5 -; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm8 -; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vpaddb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm8 +; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm11 +; AVX1-NEXT: vpor %xmm11, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpshufb %xmm6, %xmm7, %xmm3 +; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0 +; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 
; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm9, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm6, %xmm14, %xmm3 +; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm10, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm6, %xmm13, %xmm3 +; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm3 ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX1-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpaddb %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 diff --git a/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll --- a/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll +++ b/llvm/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll @@ -38,10 +38,10 @@ ; CHECK-NEXT: movl %esi, %ecx ; CHECK-NEXT: movl %edi, %edx ; CHECK-NEXT: callq bar@PLT -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: addl %ecx, %edx +; CHECK-NEXT: addl %eax, %edx ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: cvtsi2ss %eax, %xmm0 +; CHECK-NEXT: cvtsi2ss %edx, %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -212,66 +212,68 @@ ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $12, %esp +; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %eax, %edi -; WIN32-NEXT: addl %edx, %edi -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: imull %ebp, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %edi, %ecx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: addl %eax, %ebp +; WIN32-NEXT: addl %esi, %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: imull %ebx, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: mull %ebx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: addl %eax, %esi +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: adcl %ebp, %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %esi +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %ebx, %esi +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %ebp, %ecx ; WIN32-NEXT: adcl $0, %edi -; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: movl %ebx, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl %esi, %ebx -; WIN32-NEXT: adcl %edi, %ebp -; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: adcl %edi, %ebx +; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %ebx, 4(%eax) +; WIN32-NEXT: movl %ebp, 4(%eax) ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $12, %esp +; WIN32-NEXT: addl $8, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -469,30 +471,28 @@ ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: testl %esi, %esi ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %eax, %eax -; WIN32-NEXT: setne %bl -; WIN32-NEXT: andb %dl, %bl -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: setne %cl +; WIN32-NEXT: andb %dl, %cl +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: seto %bh +; WIN32-NEXT: seto %bl ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %ecx, %edx +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: seto %ch -; WIN32-NEXT: orb %bh, %ch +; WIN32-NEXT: orb %bl, %ch +; WIN32-NEXT: orb %cl, %ch ; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %edx, %eax -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %esi, %edx ; WIN32-NEXT: setb %cl ; WIN32-NEXT: orb %ch, %cl -; WIN32-NEXT: orb %bl, %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl %eax, (%esi) ; WIN32-NEXT: movl %edx, 4(%esi) @@ -571,64 +571,66 @@ ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl %ecx, %edx -; WIN32-NEXT: movl %ecx, %edi -; WIN32-NEXT: sarl $31, %edx -; WIN32-NEXT: movl %esi, %ecx -; WIN32-NEXT: imull %edx, %ecx -; WIN32-NEXT: mull %edx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %eax, %ecx -; 
WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: imull %edi, %esi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: movl %ebx, %edi +; WIN32-NEXT: imull %ecx, %edi +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: movl %eax, %ecx ; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: addl %ebp, %eax +; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: imull %ebp, %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: addl %edi, %ebx +; WIN32-NEXT: addl %eax, %ebx +; WIN32-NEXT: addl %ecx, %eax ; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill -; WIN32-NEXT: adcl %ecx, %esi -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: adcl %esi, %ebx +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %edx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: addl %ebp, %ecx -; WIN32-NEXT: adcl $0, %ebx -; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: addl %esi, %edi +; WIN32-NEXT: adcl $0, %ecx +; WIN32-NEXT: movl %ebp, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %ecx, %ebp -; WIN32-NEXT: adcl %ebx, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %esi +; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: adcl %ecx, %ebp ; WIN32-NEXT: setb %cl -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: movl %edi, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: addl %ebp, %eax ; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx ; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: sarl $31, %ebp -; WIN32-NEXT: xorl %ebp, %edx -; WIN32-NEXT: xorl %eax, %ebp +; WIN32-NEXT: adcl %ebx, %edx +; WIN32-NEXT: sarl $31, %esi +; WIN32-NEXT: xorl %esi, %edx +; WIN32-NEXT: xorl %eax, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: orl %edx, %ebp +; WIN32-NEXT: orl %edx, %esi ; WIN32-NEXT: jne LBB12_2 ; WIN32-NEXT: # %bb.1: ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: LBB12_2: -; WIN32-NEXT: movl %ebx, %edx +; WIN32-NEXT: movl %edi, %edx ; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi @@ -723,13 +725,13 @@ ; WIN32-NEXT: mull %ecx ; WIN32-NEXT: seto %bh ; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload +; WIN32-NEXT: orb %bl, %bh ; WIN32-NEXT: addl %eax, %edi ; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: addl %edi, %edx ; WIN32-NEXT: setb %al ; WIN32-NEXT: orb %bh, %al -; WIN32-NEXT: orb %bl, %al ; WIN32-NEXT: testb %al, %al ; WIN32-NEXT: jne LBB14_2 ; WIN32-NEXT: # %bb.1: @@ -989,63 +991,65 @@ ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; 
WIN32-NEXT: subl $8, %esp +; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: imull %edi, %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %esi -; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %edi, %ebp -; WIN32-NEXT: adcl $0, %esi -; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: addl %eax, %ebx +; WIN32-NEXT: addl %esi, %ebx +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %eax, %edi +; WIN32-NEXT: imull %ebp, %edi ; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: addl %edi, %esi +; WIN32-NEXT: addl %eax, %esi +; WIN32-NEXT: addl %ecx, %eax +; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill +; WIN32-NEXT: adcl %ebx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebx -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl %ebp, %edi -; WIN32-NEXT: adcl %esi, %ebx -; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %ebx, %ecx +; WIN32-NEXT: adcl $0, %ebp +; WIN32-NEXT: movl %edi, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: addl %ecx, %ebx +; WIN32-NEXT: adcl %ebp, %edi +; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: addl %ebx, %eax -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: addl %edi, %eax +; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: xorl %edi, %edx -; WIN32-NEXT: xorl %eax, %edi -; WIN32-NEXT: orl %edx, %edi +; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: sarl $31, %ebx +; WIN32-NEXT: xorl %ebx, %edx +; WIN32-NEXT: xorl %eax, %ebx +; WIN32-NEXT: orl %edx, %ebx ; WIN32-NEXT: jne LBB18_1 ; WIN32-NEXT: # %bb.3: # %continue ; WIN32-NEXT: movb $1, %al ; WIN32-NEXT: LBB18_2: # %overflow -; WIN32-NEXT: addl $8, %esp +; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -1313,30 +1317,28 @@ ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: movl 
{{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: testl %esi, %esi ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %eax, %eax -; WIN32-NEXT: setne %bl -; WIN32-NEXT: andb %dl, %bl -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: setne %cl +; WIN32-NEXT: andb %dl, %cl +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: seto %bh +; WIN32-NEXT: seto %bl ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %ecx, %edx -; WIN32-NEXT: seto %cl -; WIN32-NEXT: orb %bh, %cl -; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %edx, %eax ; WIN32-NEXT: mull %ebp +; WIN32-NEXT: seto %ch +; WIN32-NEXT: orb %bl, %ch +; WIN32-NEXT: orb %cl, %ch +; WIN32-NEXT: leal (%edi,%eax), %esi +; WIN32-NEXT: movl %ebp, %eax +; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %esi, %edx ; WIN32-NEXT: setb %al -; WIN32-NEXT: orb %cl, %al -; WIN32-NEXT: orb %bl, %al +; WIN32-NEXT: orb %ch, %al ; WIN32-NEXT: subb $1, %al ; WIN32-NEXT: je LBB22_1 ; WIN32-NEXT: # %bb.3: # %continue @@ -1694,68 +1696,72 @@ ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: subl $20, %esp +; WIN32-NEXT: subl $16, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %edx -; WIN32-NEXT: movl %edx, (%esp) # 4-byte Spill -; WIN32-NEXT: movl 4(%eax), %ebx +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl 4(%eax), %esi +; WIN32-NEXT: movl %esi, (%esp) # 4-byte Spill ; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: movl %ecx, %edi ; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: imull %ebx, %esi +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: imull %esi, %ecx ; WIN32-NEXT: mull %edx ; WIN32-NEXT: movl %eax, %ebp -; WIN32-NEXT: addl %eax, %esi -; WIN32-NEXT: addl %edx, %esi -; WIN32-NEXT: movl %ebx, %edi -; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: sarl $31, %edi -; WIN32-NEXT: imull %edi, %ecx +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: addl %ecx, %ebx +; WIN32-NEXT: movl %esi, %ecx +; WIN32-NEXT: sarl $31, %ecx +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %edi -; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl %edx, %ecx +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %edi +; WIN32-NEXT: addl %eax, %edi +; WIN32-NEXT: addl %esi, %edi +; WIN32-NEXT: addl %ebp, %ebx ; WIN32-NEXT: addl %eax, %ebp ; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %esi, %ecx -; WIN32-NEXT: movl (%esp), %esi # 4-byte Reload +; WIN32-NEXT: adcl %ebx, %edi +; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN32-NEXT: mull %edi -; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %edi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; WIN32-NEXT: mull %ecx ; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; WIN32-NEXT: adcl $0, %ebp +; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload +; WIN32-NEXT: mull 
%ecx +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl %ebp, %ecx +; WIN32-NEXT: adcl $0, %ebx ; WIN32-NEXT: movl %esi, %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: movl %edx, %esi -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl %edi, %ebx -; WIN32-NEXT: adcl %ebp, %esi -; WIN32-NEXT: setb (%esp) # 1-byte Folded Spill -; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: adcl %ebx, %esi +; WIN32-NEXT: setb %cl +; WIN32-NEXT: movl (%esp), %eax # 4-byte Reload ; WIN32-NEXT: mull {{[0-9]+}}(%esp) ; WIN32-NEXT: addl %esi, %eax -; WIN32-NEXT: movzbl (%esp), %esi # 1-byte Folded Reload -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: adcl %edi, %edx +; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %ebx, 4(%eax) +; WIN32-NEXT: movl %ebp, 4(%eax) ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al -; WIN32-NEXT: addl $20, %esp +; WIN32-NEXT: addl $16, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -1802,59 +1808,61 @@ ; WIN32-NEXT: subl $16, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ebx -; WIN32-NEXT: movl 4(%eax), %ebp -; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; WIN32-NEXT: movl (%eax), %ebp +; WIN32-NEXT: movl 4(%eax), %ebx +; WIN32-NEXT: movl %ebx, (%esp) # 4-byte Spill ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %ebp, %edi -; WIN32-NEXT: imull %ecx, %edi -; WIN32-NEXT: movl %ebx, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl %eax, %edi -; WIN32-NEXT: addl %edx, %edi +; WIN32-NEXT: movl %ebx, %esi +; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: sarl $31, %eax -; WIN32-NEXT: movl %eax, %ecx -; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: mull %ebp +; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %eax, %edi ; WIN32-NEXT: addl %eax, %ecx -; WIN32-NEXT: addl %edx, %ecx -; WIN32-NEXT: addl %esi, %eax +; WIN32-NEXT: addl %esi, %ecx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: sarl $31, %eax +; WIN32-NEXT: movl %eax, %ebx +; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: movl %edx, %esi +; WIN32-NEXT: addl %ebx, %esi +; WIN32-NEXT: addl %eax, %esi +; WIN32-NEXT: addl %edi, %eax ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; WIN32-NEXT: adcl %edi, %ecx -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %ebx -; WIN32-NEXT: movl %edx, %ebp +; WIN32-NEXT: adcl %ecx, %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %ebp +; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull %ebx +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %edx, %edi -; WIN32-NEXT: movl %eax, %esi -; WIN32-NEXT: addl 
%ebp, %esi +; WIN32-NEXT: movl %eax, %ecx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; WIN32-NEXT: adcl $0, %edi +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload +; WIN32-NEXT: movl %edx, %ebx +; WIN32-NEXT: movl %eax, %ebp +; WIN32-NEXT: addl %ecx, %ebp +; WIN32-NEXT: adcl %edi, %ebx +; WIN32-NEXT: setb %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; WIN32-NEXT: movl %edx, %ebp -; WIN32-NEXT: movl %eax, %ebx -; WIN32-NEXT: addl %esi, %ebx -; WIN32-NEXT: adcl %edi, %ebp -; WIN32-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; WIN32-NEXT: addl %ebp, %eax -; WIN32-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; WIN32-NEXT: adcl %esi, %edx -; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: mull (%esp) # 4-byte Folded Reload +; WIN32-NEXT: addl %ebx, %eax +; WIN32-NEXT: movzbl %cl, %ecx ; WIN32-NEXT: adcl %ecx, %edx -; WIN32-NEXT: movl %ebx, %ecx +; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; WIN32-NEXT: adcl %esi, %edx +; WIN32-NEXT: movl %ebp, %ecx ; WIN32-NEXT: sarl $31, %ecx ; WIN32-NEXT: xorl %ecx, %edx ; WIN32-NEXT: xorl %eax, %ecx ; WIN32-NEXT: orl %edx, %ecx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl %ebx, 4(%eax) +; WIN32-NEXT: movl %ebp, 4(%eax) ; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; WIN32-NEXT: movl %ecx, (%eax) ; WIN32-NEXT: setne %al @@ -2213,36 +2221,34 @@ ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl (%eax), %ecx +; WIN32-NEXT: movl (%eax), %esi ; WIN32-NEXT: movl 4(%eax), %eax -; WIN32-NEXT: testl %esi, %esi +; WIN32-NEXT: testl %ebx, %ebx ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %eax, %eax -; WIN32-NEXT: setne %bl -; WIN32-NEXT: andb %dl, %bl +; WIN32-NEXT: setne %cl +; WIN32-NEXT: andb %dl, %cl ; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: seto %ch +; WIN32-NEXT: movl %ebx, %eax +; WIN32-NEXT: mull %esi +; WIN32-NEXT: seto %bl +; WIN32-NEXT: orb %ch, %bl +; WIN32-NEXT: orb %cl, %bl +; WIN32-NEXT: leal (%edi,%eax), %ecx ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ecx -; WIN32-NEXT: seto %bh -; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload -; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %ecx, %eax ; WIN32-NEXT: mull %ebp -; WIN32-NEXT: addl %esi, %edx +; WIN32-NEXT: addl %ecx, %edx ; WIN32-NEXT: setb %cl -; WIN32-NEXT: orb %bh, %cl ; WIN32-NEXT: orb %bl, %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl %eax, (%esi) ; WIN32-NEXT: movl %edx, 4(%esi) ; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx @@ -2293,36 +2299,33 @@ ; WIN32-NEXT: pushl %ebx ; WIN32-NEXT: pushl %edi ; WIN32-NEXT: pushl %esi -; WIN32-NEXT: pushl %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx -; WIN32-NEXT: movl (%edx), %ecx +; WIN32-NEXT: movl (%edx), %ebp ; WIN32-NEXT: 
movl 4(%edx), %esi ; WIN32-NEXT: testl %eax, %eax ; WIN32-NEXT: setne %dl ; WIN32-NEXT: testl %esi, %esi -; WIN32-NEXT: setne %bl -; WIN32-NEXT: andb %dl, %bl -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: setne %cl +; WIN32-NEXT: andb %dl, %cl +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: movl %eax, %edi -; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; WIN32-NEXT: seto %bl ; WIN32-NEXT: movl %esi, %eax -; WIN32-NEXT: mull %ebp -; WIN32-NEXT: seto %bh -; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload +; WIN32-NEXT: mull {{[0-9]+}}(%esp) +; WIN32-NEXT: seto %ch +; WIN32-NEXT: orb %bl, %ch +; WIN32-NEXT: orb %cl, %ch ; WIN32-NEXT: leal (%edi,%eax), %esi -; WIN32-NEXT: movl %ebp, %eax -; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax +; WIN32-NEXT: mull %ebp ; WIN32-NEXT: addl %esi, %edx ; WIN32-NEXT: setb %cl -; WIN32-NEXT: orb %bh, %cl -; WIN32-NEXT: orb %bl, %cl +; WIN32-NEXT: orb %ch, %cl ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl %eax, (%esi) ; WIN32-NEXT: movl %edx, 4(%esi) ; WIN32-NEXT: movl %ecx, %eax -; WIN32-NEXT: addl $4, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: popl %edi ; WIN32-NEXT: popl %ebx