Index: lib/Target/X86/X86FastISel.cpp =================================================================== --- lib/Target/X86/X86FastISel.cpp +++ lib/Target/X86/X86FastISel.cpp @@ -1538,26 +1538,7 @@ return false; } - if (DstVT == MVT::i64) { - // Handle extension to 64-bits via sub-register shenanigans. - unsigned MovInst; - - switch (SrcVT.SimpleTy) { - case MVT::i8: MovInst = X86::MOVZX32rr8; break; - case MVT::i16: MovInst = X86::MOVZX32rr16; break; - case MVT::i32: MovInst = X86::MOV32rr; break; - default: llvm_unreachable("Unexpected zext to i64 source type"); - } - - unsigned Result32 = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32) - .addReg(ResultReg); - - ResultReg = createResultReg(&X86::GR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), - ResultReg) - .addImm(0).addReg(Result32).addImm(X86::sub_32bit); - } else if (DstVT == MVT::i16) { + if (DstVT == MVT::i16) { // i8->i16 doesn't exist in the autogenerated isel table. Need to zero // extend to 32-bits and then extract down to 16-bits. unsigned Result32 = createResultReg(&X86::GR32RegClass); Index: lib/Target/X86/X86FixupSetCC.cpp =================================================================== --- lib/Target/X86/X86FixupSetCC.cpp +++ lib/Target/X86/X86FixupSetCC.cpp @@ -134,7 +134,8 @@ MachineInstr *ZExt = nullptr; for (auto &Use : MRI->use_instructions(MI.getOperand(0).getReg())) - if (Use.getOpcode() == X86::MOVZX32rr8) + if (Use.getOpcode() == X86::MOVZX32rr8 || + Use.getOpcode() == X86::MOVZX64rr8_alt) ZExt = &Use; if (!ZExt) @@ -162,14 +163,24 @@ ? &X86::GR32RegClass : &X86::GR32_ABCDRegClass; unsigned ZeroReg = MRI->createVirtualRegister(RC); - unsigned InsertReg = MRI->createVirtualRegister(RC); // Initialize a register with 0. This must go before the eflags def BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), TII->get(X86::MOV32r0), ZeroReg); + // If this is a 64-bit zero extend we need to wrap with a subreg_to_reg. + if (ZExt->getOpcode() == X86::MOVZX64rr8_alt) { + RC = &X86::GR64RegClass; + unsigned Reg = MRI->createVirtualRegister(RC); + BuildMI(MBB, FlagsDefMI, MI.getDebugLoc(), + TII->get(TargetOpcode::SUBREG_TO_REG), Reg) + .addImm(0).addReg(ZeroReg).addImm(X86::sub_32bit); + ZeroReg = Reg; + } + // X86 setcc only takes an output GR8, so fake a GR32 input by inserting // the setcc result into the low byte of the zeroed register. 
+ unsigned InsertReg = MRI->createVirtualRegister(RC); BuildMI(*ZExt->getParent(), ZExt, ZExt->getDebugLoc(), TII->get(X86::INSERT_SUBREG), InsertReg) .addReg(ZeroReg) Index: lib/Target/X86/X86InstrCompiler.td =================================================================== --- lib/Target/X86/X86InstrCompiler.td +++ lib/Target/X86/X86InstrCompiler.td @@ -1236,10 +1236,8 @@ def : Pat<(i32 (anyext GR16:$src)), (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>; -def : Pat<(i64 (anyext GR8 :$src)), - (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>; -def : Pat<(i64 (anyext GR16:$src)), - (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>; +def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8_alt GR8 :$src)>; +def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16_alt GR16 :$src)>; def : Pat<(i64 (anyext GR32:$src)), (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; @@ -1412,19 +1410,13 @@ // r & (2^32-1) ==> movz def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), - (SUBREG_TO_REG (i64 0), - (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), - sub_32bit)>; + (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; // r & (2^16-1) ==> movz def : Pat<(and GR64:$src, 0xffff), - (SUBREG_TO_REG (i64 0), - (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), - sub_32bit)>; + (MOVZX64rr16_alt (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>; // r & (2^8-1) ==> movz def : Pat<(and GR64:$src, 0xff), - (SUBREG_TO_REG (i64 0), - (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), - sub_32bit)>; + (MOVZX64rr8_alt (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>; // r & (2^8-1) ==> movz def : Pat<(and GR32:$src1, 0xff), (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, @@ -1475,6 +1467,12 @@ (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>; def: Pat<(zextloadi16i8 addr:$src), (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>; +def: Pat<(zextloadi64i8 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; +def: Pat<(zextloadi64i16 addr:$src), + (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; +def: Pat<(zextloadi64i32 addr:$src), + (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; // trunc patterns def : Pat<(i16 (trunc GR32:$src)), Index: lib/Target/X86/X86InstrExtension.td =================================================================== --- lib/Target/X86/X86InstrExtension.td +++ lib/Target/X86/X86InstrExtension.td @@ -163,24 +163,31 @@ TB, Sched<[WriteALULd]>; } -// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a -// 32-bit register. -def : Pat<(i64 (zext GR8:$src)), - (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>; -def : Pat<(zextloadi64i8 addr:$src), - (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>; - -def : Pat<(i64 (zext GR16:$src)), - (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>; -def : Pat<(zextloadi64i16 addr:$src), - (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>; - -// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a -// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible -// when the 32-bit value is defined by a truncate or is copied from something -// where the high bits aren't necessarily all zero. In such cases, we fall back -// to these explicit zext instructions. 
-def : Pat<(i64 (zext GR32:$src)), - (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>; -def : Pat<(i64 (zextloadi64i32 addr:$src)), - (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>; +let isPseudo = 1 in { + +// Use movzbl instead of movzbq when the destination is a register; it's +// equivalent due to implicit zero-extending, and it has a smaller encoding. +// This will be converted to MOVZX32rr8 after register allocation. +def MOVZX64rr8_alt : I<0, Pseudo, (outs GR64:$dst), (ins GR8 :$src), + "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, + Sched<[WriteALU]>; +// Use movzwl instead of movzwq when the destination is a register; it's +// equivalent due to implicit zero-extending, and it has a smaller encoding. +// This will be converted to MOVZX32rr16 after register allocation. +def MOVZX64rr16_alt: I<0, Pseudo, (outs GR64:$dst), (ins GR16:$src), + "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, + Sched<[WriteALU]>; + +// There's no movzlq instruction, but movl can be used for this purpose, using +// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero +// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit +// zero-extension, however this isn't possible when the 32-bit value is +// defined by a truncate or is copied from something where the high bits aren't +// necessarily all zero. In such cases, we fall back to these explicit zext +// instructions. +// This will be converted to MOV32rr after register allocation. +def MOVZX64rr32 : I<0, Pseudo, (outs GR64:$dst), (ins GR32:$src), + "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>, + Sched<[WriteALU]>; +} + Index: lib/Target/X86/X86InstrInfo.cpp =================================================================== --- lib/Target/X86/X86InstrInfo.cpp +++ lib/Target/X86/X86InstrInfo.cpp @@ -3681,6 +3681,7 @@ case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: + case X86::MOVZX64rr8_alt: if (!Subtarget.is64Bit()) // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. @@ -3689,7 +3690,9 @@ case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: - case X86::MOVSX64rr32: { + case X86::MOVZX64rr16_alt: + case X86::MOVSX64rr32: + case X86::MOVZX64rr32: { if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) // Be conservative. 
return false; @@ -3702,14 +3705,17 @@ case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: + case X86::MOVZX64rr8_alt: SubIdx = X86::sub_8bit; break; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: + case X86::MOVZX64rr16_alt: SubIdx = X86::sub_16bit; break; case X86::MOVSX64rr32: + case X86::MOVZX64rr32: SubIdx = X86::sub_32bit; break; } @@ -7700,10 +7706,27 @@ return true; } + +static bool expandZeroExtend64(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc) { + MIB->setDesc(Desc); + unsigned DestReg = MIB->getOperand(0).getReg(); + MIB->getOperand(0).setReg(getX86SubSuperRegister(DestReg, 32)); + MIB.addReg(DestReg, RegState::ImplicitDefine); + return true; +} + + bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); switch (MI.getOpcode()) { + case X86::MOVZX64rr8_alt: + return expandZeroExtend64(MIB, get(X86::MOVZX32rr8)); + case X86::MOVZX64rr16_alt: + return expandZeroExtend64(MIB, get(X86::MOVZX32rr16)); + case X86::MOVZX64rr32: + return expandZeroExtend64(MIB, get(X86::MOV32rr)); case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); case X86::MOV32r1: Index: test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir =================================================================== --- test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir +++ test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir @@ -151,7 +151,7 @@ - { id: 2, class: gpr } # ALL: %0 = COPY %rdi # ALL-NEXT: %1 = COPY %0.sub_8bit -# ALL-NEXT: %2 = SUBREG_TO_REG 0, %1, 1 +# ALL-NEXT: %2 = MOVZX64rr8_alt %1 # ALL-NEXT: %rax = COPY %2 # ALL-NEXT: RET 0, implicit %rax body: | @@ -180,7 +180,7 @@ - { id: 2, class: gpr } # ALL: %0 = COPY %rdi # ALL-NEXT: %1 = COPY %0.sub_16bit -# ALL-NEXT: %2 = SUBREG_TO_REG 0, %1, 3 +# ALL-NEXT: %2 = MOVZX64rr16_alt %1 # ALL-NEXT: %rax = COPY %2 # ALL-NEXT: RET 0, implicit %rax body: | Index: test/CodeGen/X86/MergeConsecutiveStores.ll =================================================================== --- test/CodeGen/X86/MergeConsecutiveStores.ll +++ test/CodeGen/X86/MergeConsecutiveStores.ll @@ -451,30 +451,28 @@ define void @MergeLoadStoreBaseIndexOffset(i64* %a, i8* %b, i8* %c, i32 %n) { ; BWON-LABEL: MergeLoadStoreBaseIndexOffset: ; BWON: # BB#0: -; BWON-NEXT: movl %ecx, %r8d -; BWON-NEXT: xorl %ecx, %ecx +; BWON-NEXT: xorl %r8d, %r8d ; BWON-NEXT: .p2align 4, 0x90 ; BWON-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 -; BWON-NEXT: movq (%rdi,%rcx,8), %rax +; BWON-NEXT: movq (%rdi,%r8,8), %rax ; BWON-NEXT: movzwl (%rdx,%rax), %eax -; BWON-NEXT: movw %ax, (%rsi,%rcx,2) -; BWON-NEXT: incq %rcx -; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: movw %ax, (%rsi,%r8,2) +; BWON-NEXT: incq %r8 +; BWON-NEXT: cmpl %r8d, %ecx ; BWON-NEXT: jne .LBB9_1 ; BWON-NEXT: # BB#2: ; BWON-NEXT: retq ; ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffset: ; BWOFF: # BB#0: -; BWOFF-NEXT: movl %ecx, %r8d -; BWOFF-NEXT: xorl %ecx, %ecx +; BWOFF-NEXT: xorl %r8d, %r8d ; BWOFF-NEXT: .p2align 4, 0x90 ; BWOFF-NEXT: .LBB9_1: # =>This Inner Loop Header: Depth=1 -; BWOFF-NEXT: movq (%rdi,%rcx,8), %rax +; BWOFF-NEXT: movq (%rdi,%r8,8), %rax ; BWOFF-NEXT: movw (%rdx,%rax), %ax -; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2) -; BWOFF-NEXT: incq %rcx -; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: movw %ax, (%rsi,%r8,2) +; BWOFF-NEXT: incq %r8 +; BWOFF-NEXT: cmpl %r8d, %ecx ; BWOFF-NEXT: jne .LBB9_1 ; BWOFF-NEXT: # BB#2: ; BWOFF-NEXT: retq @@ -567,30 +565,28 @@ define void 
@MergeLoadStoreBaseIndexOffsetSext(i8* %a, i8* %b, i8* %c, i32 %n) { ; BWON-LABEL: MergeLoadStoreBaseIndexOffsetSext: ; BWON: # BB#0: -; BWON-NEXT: movl %ecx, %r8d -; BWON-NEXT: xorl %ecx, %ecx +; BWON-NEXT: xorl %r8d, %r8d ; BWON-NEXT: .p2align 4, 0x90 ; BWON-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 -; BWON-NEXT: movsbq (%rdi,%rcx), %rax +; BWON-NEXT: movsbq (%rdi,%r8), %rax ; BWON-NEXT: movzwl (%rdx,%rax), %eax -; BWON-NEXT: movw %ax, (%rsi,%rcx,2) -; BWON-NEXT: incq %rcx -; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: movw %ax, (%rsi,%r8,2) +; BWON-NEXT: incq %r8 +; BWON-NEXT: cmpl %r8d, %ecx ; BWON-NEXT: jne .LBB11_1 ; BWON-NEXT: # BB#2: ; BWON-NEXT: retq ; ; BWOFF-LABEL: MergeLoadStoreBaseIndexOffsetSext: ; BWOFF: # BB#0: -; BWOFF-NEXT: movl %ecx, %r8d -; BWOFF-NEXT: xorl %ecx, %ecx +; BWOFF-NEXT: xorl %r8d, %r8d ; BWOFF-NEXT: .p2align 4, 0x90 ; BWOFF-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 -; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax +; BWOFF-NEXT: movsbq (%rdi,%r8), %rax ; BWOFF-NEXT: movw (%rdx,%rax), %ax -; BWOFF-NEXT: movw %ax, (%rsi,%rcx,2) -; BWOFF-NEXT: incq %rcx -; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: movw %ax, (%rsi,%r8,2) +; BWOFF-NEXT: incq %r8 +; BWOFF-NEXT: cmpl %r8d, %ecx ; BWOFF-NEXT: jne .LBB11_1 ; BWOFF-NEXT: # BB#2: ; BWOFF-NEXT: retq @@ -625,38 +621,36 @@ define void @loadStoreBaseIndexOffsetSextNoSex(i8* %a, i8* %b, i8* %c, i32 %n) { ; BWON-LABEL: loadStoreBaseIndexOffsetSextNoSex: ; BWON: # BB#0: -; BWON-NEXT: movl %ecx, %r8d -; BWON-NEXT: xorl %ecx, %ecx +; BWON-NEXT: xorl %r10d, %r10d ; BWON-NEXT: .p2align 4, 0x90 ; BWON-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 -; BWON-NEXT: movsbq (%rdi,%rcx), %rax -; BWON-NEXT: movzbl (%rdx,%rax), %r9d -; BWON-NEXT: incb %al -; BWON-NEXT: movsbq %al, %rax +; BWON-NEXT: movsbq (%rdi,%r10), %r8 +; BWON-NEXT: movzbl (%rdx,%r8), %r9d +; BWON-NEXT: incb %r8b +; BWON-NEXT: movsbq %r8b, %rax ; BWON-NEXT: movzbl (%rdx,%rax), %eax -; BWON-NEXT: movb %r9b, (%rsi,%rcx,2) -; BWON-NEXT: movb %al, 1(%rsi,%rcx,2) -; BWON-NEXT: incq %rcx -; BWON-NEXT: cmpl %ecx, %r8d +; BWON-NEXT: movb %r9b, (%rsi,%r10,2) +; BWON-NEXT: movb %al, 1(%rsi,%r10,2) +; BWON-NEXT: incq %r10 +; BWON-NEXT: cmpl %r10d, %ecx ; BWON-NEXT: jne .LBB12_1 ; BWON-NEXT: # BB#2: ; BWON-NEXT: retq ; ; BWOFF-LABEL: loadStoreBaseIndexOffsetSextNoSex: ; BWOFF: # BB#0: -; BWOFF-NEXT: movl %ecx, %r8d -; BWOFF-NEXT: xorl %ecx, %ecx +; BWOFF-NEXT: xorl %r10d, %r10d ; BWOFF-NEXT: .p2align 4, 0x90 ; BWOFF-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 -; BWOFF-NEXT: movsbq (%rdi,%rcx), %rax -; BWOFF-NEXT: movb (%rdx,%rax), %r9b -; BWOFF-NEXT: incb %al -; BWOFF-NEXT: movsbq %al, %rax +; BWOFF-NEXT: movsbq (%rdi,%r10), %r8 +; BWOFF-NEXT: movb (%rdx,%r8), %r9b +; BWOFF-NEXT: incb %r8b +; BWOFF-NEXT: movsbq %r8b, %rax ; BWOFF-NEXT: movb (%rdx,%rax), %al -; BWOFF-NEXT: movb %r9b, (%rsi,%rcx,2) -; BWOFF-NEXT: movb %al, 1(%rsi,%rcx,2) -; BWOFF-NEXT: incq %rcx -; BWOFF-NEXT: cmpl %ecx, %r8d +; BWOFF-NEXT: movb %r9b, (%rsi,%r10,2) +; BWOFF-NEXT: movb %al, 1(%rsi,%r10,2) +; BWOFF-NEXT: incq %r10 +; BWOFF-NEXT: cmpl %r10d, %ecx ; BWOFF-NEXT: jne .LBB12_1 ; BWOFF-NEXT: # BB#2: ; BWOFF-NEXT: retq Index: test/CodeGen/X86/mul-i1024.ll =================================================================== --- test/CodeGen/X86/mul-i1024.ll +++ test/CodeGen/X86/mul-i1024.ll @@ -4246,46 +4246,46 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: subq $352, %rsp # imm = 0x160 ; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq 48(%rdi), %r9 -; X64-NEXT: movq %r9, 
{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq 40(%rdi), %rbp +; X64-NEXT: movq %rdi, %r13 +; X64-NEXT: movq 48(%r13), %rcx +; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq 40(%r13), %rbp ; X64-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq 32(%rdi), %rax +; X64-NEXT: movq 32(%r13), %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %rdi, %r10 ; X64-NEXT: xorl %r8d, %r8d ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq %rax, %r9 ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rdi, %rbx ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %rcx, %rbx +; X64-NEXT: addq %r9, %rbx ; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %rcx, %r11 ; X64-NEXT: adcq %rdi, %rbp ; X64-NEXT: setb %bl ; X64-NEXT: movzbl %bl, %ebx ; X64-NEXT: addq %rax, %rbp ; X64-NEXT: adcq %rdx, %rbx -; X64-NEXT: movq %r9, %rax +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %r11, %r12 -; X64-NEXT: movq %r11, %r8 -; X64-NEXT: addq %rax, %r12 +; X64-NEXT: movq %r9, %r14 +; X64-NEXT: movq %r9, %r8 +; X64-NEXT: addq %rax, %r14 ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: movq %rdi, %r9 -; X64-NEXT: movq %r9, (%rsp) # 8-byte Spill +; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq %rdx, %rax -; X64-NEXT: addq %rbp, %r12 -; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: addq %rbp, %r14 +; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq %rbx, %rax -; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq (%rsi), %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: xorl %ebp, %ebp @@ -4296,11 +4296,11 @@ ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %rbp ; X64-NEXT: xorl %r11d, %r11d -; X64-NEXT: movq %rax, %r15 -; X64-NEXT: addq %rcx, %r15 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %rdi, %r15 +; X64-NEXT: addq %rdi, %r9 ; X64-NEXT: adcq %rcx, %rbp ; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: setb %bl @@ -4308,160 +4308,157 @@ ; X64-NEXT: movzbl %bl, %ebx ; X64-NEXT: adcq %rdx, %rbx ; X64-NEXT: movq 16(%rsi), %rax -; X64-NEXT: movq %rsi, %r13 -; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rsi, %r12 +; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %rdi, %r14 -; X64-NEXT: addq %rax, %r14 -; X64-NEXT: movq %rcx, %r11 -; X64-NEXT: adcq %rdx, %r11 -; X64-NEXT: addq %rbp, %r14 -; X64-NEXT: adcq %rbx, %r11 +; X64-NEXT: movq %rdi, %r11 +; X64-NEXT: addq %rax, %r11 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: adcq %rdx, %rax +; X64-NEXT: addq %rbp, %r11 +; X64-NEXT: adcq %rbx, %rax +; X64-NEXT: movq %rax, %rbp ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %r8, %rbp -; X64-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %r9, %rax 
+; X64-NEXT: movq %r10, %rax ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq (%r10), %rax +; X64-NEXT: movq (%r13), %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: xorl %r8d, %r8d -; X64-NEXT: mulq %r8 +; X64-NEXT: xorl %r10d, %r10d +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rdi, %rax -; X64-NEXT: movq %rdi, %r9 ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: adcq %rcx, %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq 32(%r13), %rax +; X64-NEXT: movq 32(%r12), %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: mulq %r8 -; X64-NEXT: xorl %r8d, %r8d -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: movq %rbx, %rcx -; X64-NEXT: addq %r13, %rax +; X64-NEXT: addq %r12, %rax ; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq %rdx, %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %rbp, %rax -; X64-NEXT: addq %r9, %rax -; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: addq %rdi, %r8 +; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rdi, %r10 +; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; X64-NEXT: adcq %r15, %rax +; X64-NEXT: adcq %r9, %rax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq %r14, %r12 -; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; X64-NEXT: adcq %r11, %rax +; X64-NEXT: adcq %r11, %r14 +; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %r15, %rax +; X64-NEXT: adcq %rbp, %rax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %r11, %rdi -; X64-NEXT: movq 8(%r10), %rax +; X64-NEXT: movq %rbp, %rdi +; X64-NEXT: movq 8(%r13), %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: xorl %r8d, %r8d ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rax, %r11 -; X64-NEXT: addq %rsi, %r11 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rsi, %r15 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %rcx, %r11 +; X64-NEXT: addq %rcx, %r15 ; X64-NEXT: adcq %rsi, %rbp -; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: setb %bl ; X64-NEXT: addq %rax, %rbp ; X64-NEXT: movzbl %bl, %ebx ; X64-NEXT: adcq %rdx, %rbx -; X64-NEXT: movq 16(%r10), %rax +; X64-NEXT: movq 16(%r13), %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rcx, %r8 ; X64-NEXT: addq %rax, %r8 -; X64-NEXT: movq %rsi, %r10 -; X64-NEXT: adcq %rdx, %r10 +; X64-NEXT: adcq %rdx, %rsi ; X64-NEXT: addq %rbp, %r8 -; X64-NEXT: movq %r8, %rax -; X64-NEXT: adcq %rbx, %r10 -; X64-NEXT: movq %rcx, %rdx -; X64-NEXT: movq %rcx, %r12 -; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: addq %r9, %rdx -; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %r11, %r8 -; X64-NEXT: adcq %r8, %r15 -; X64-NEXT: movq %r15, 
{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq %rax, %r14 -; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: adcq %r10, %rdi +; X64-NEXT: adcq %rbx, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: movq %rcx, %rsi +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %r14, (%rsp) # 8-byte Spill +; X64-NEXT: addq %r10, %rsi +; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %r15, %rsi +; X64-NEXT: adcq %rsi, %r9 +; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: adcq %r8, %r11 +; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: adcq %rdx, %rdi ; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload -; X64-NEXT: movq 40(%rsi), %rax +; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r15 # 8-byte Reload +; X64-NEXT: movq 40(%r15), %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: xorl %r14d, %r14d -; X64-NEXT: mulq %r14 +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload ; X64-NEXT: addq %r9, %rdi ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: addq %r13, %rdi +; X64-NEXT: addq %r12, %rdi ; X64-NEXT: adcq %r9, %rbp ; X64-NEXT: setb %bl ; X64-NEXT: addq %rax, %rbp -; X64-NEXT: movzbl %bl, %r11d -; X64-NEXT: adcq %rdx, %r11 -; X64-NEXT: movq 48(%rsi), %rax -; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: mulq %r14 +; X64-NEXT: movzbl %bl, %ebx +; X64-NEXT: adcq %rdx, %rbx +; X64-NEXT: movq 48(%r15), %rax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %r13, %rbx -; X64-NEXT: addq %rax, %rbx -; X64-NEXT: movq %r9, %rsi -; X64-NEXT: adcq %rdx, %rsi -; X64-NEXT: addq %rbp, %rbx -; X64-NEXT: adcq %r11, %rsi -; X64-NEXT: movq %r13, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: addq %r13, %r12 -; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq %rdi, %r8 -; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %r12, %r15 +; X64-NEXT: addq %rax, %r15 +; X64-NEXT: movq %r9, %rcx +; X64-NEXT: adcq %rdx, %rcx +; X64-NEXT: addq %rbp, %r15 ; X64-NEXT: adcq %rbx, %rcx -; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq %rsi, %r10 -; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: addq %r12, %r14 +; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: adcq %rdi, %rsi +; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: adcq %r15, %r8 +; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: adcq %rcx, %r11 +; X64-NEXT: movq %r11, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload ; X64-NEXT: movq %rdx, %rax -; X64-NEXT: addq %r13, %rax -; X64-NEXT: movq (%rsp), %rax # 8-byte Reload -; X64-NEXT: adcq %r9, %rax -; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: addq %r12, %rax +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload +; X64-NEXT: adcq %r9, %rsi +; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rdx, %rax -; X64-NEXT: addq %r13, %rax +; X64-NEXT: addq %r12, %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload 
; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload -; X64-NEXT: movq %rbx, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload -; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r15 # 8-byte Folded Reload +; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload +; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; X64-NEXT: movq 56(%rax), %r11 +; X64-NEXT: movq 56(%r13), %r11 ; X64-NEXT: movq %r11, %rax ; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %rdi @@ -4474,8 +4471,8 @@ ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r8 -; X64-NEXT: addq %rbx, %r8 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rbx, %r9 ; X64-NEXT: adcq %rbp, %rsi ; X64-NEXT: setb %cl ; X64-NEXT: movq %r11, %rax @@ -4520,8 +4517,8 @@ ; X64-NEXT: adcq %rax, %r13 ; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r13 # 8-byte Folded Reload -; X64-NEXT: addq %r9, %rsi -; X64-NEXT: adcq %r8, %r13 +; X64-NEXT: addq %r8, %rsi +; X64-NEXT: adcq %r9, %r13 ; X64-NEXT: adcq $0, %r15 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: movq %r10, %rbx @@ -4537,28 +4534,25 @@ ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: adcq $0, %rdi -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload -; X64-NEXT: movq 24(%rax), %rcx +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload +; X64-NEXT: movq 24(%rax), %r14 ; X64-NEXT: movq %rbx, %rax -; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %rbx -; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rbp, %r8 ; X64-NEXT: adcq %rdi, %rcx -; X64-NEXT: setb %dil +; X64-NEXT: setb %bl ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rbx +; X64-NEXT: mulq %r14 ; X64-NEXT: addq %rcx, %rax -; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: movzbl %bl, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r14 # 8-byte Reload -; X64-NEXT: addq %r14, %rbp -; X64-NEXT: movq (%rsp), %rbx # 8-byte Reload ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload -; X64-NEXT: adcq %r9, %rbx +; X64-NEXT: addq %r9, %rbp +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload +; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload ; X64-NEXT: addq %rax, %rbp ; X64-NEXT: adcq %rdx, %rbx ; X64-NEXT: addq %rsi, %r10 @@ -4584,23 +4578,22 @@ ; X64-NEXT: addq %r11, %rdi ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 # 8-byte Reload -; X64-NEXT: mulq %r8 +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %rdi, %r11 ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil ; X64-NEXT: movq %r12, %rax -; X64-NEXT: mulq %r8 -; X64-NEXT: movq %r8, %r12 +; X64-NEXT: mulq %r14 +; X64-NEXT: movq %r14, %r12 ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %sil, %ecx ; X64-NEXT: adcq %rcx, 
%rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; X64-NEXT: addq %r14, %rcx +; X64-NEXT: addq %r9, %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 # 8-byte Reload -; X64-NEXT: adcq %r9, %r14 +; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload ; X64-NEXT: addq %rax, %rcx ; X64-NEXT: adcq %rdx, %r14 ; X64-NEXT: addq %rbp, %r13 @@ -4622,7 +4615,7 @@ ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload ; X64-NEXT: movq 24(%rax), %rcx ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill @@ -4711,7 +4704,7 @@ ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload +; X64-NEXT: movq (%rsp), %rdi # 8-byte Reload ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r13 # 8-byte Reload ; X64-NEXT: addq %r13, %rdi ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload @@ -4741,6 +4734,7 @@ ; X64-NEXT: addq %r10, %rbx ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r15 @@ -4861,7 +4855,7 @@ ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %r14, %rcx ; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload ; X64-NEXT: movq 56(%rax), %rdi ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rdi @@ -4876,8 +4870,8 @@ ; X64-NEXT: addq %rsi, %rax ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload +; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload ; X64-NEXT: addq %r11, %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload @@ -4927,7 +4921,8 @@ ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: adcq %rsi, %r8 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax # 1-byte Folded Reload +; X64-NEXT: movb {{[0-9]+}}(%rsp), %al # 1-byte Reload +; X64-NEXT: movzbl %al, %eax ; X64-NEXT: adcq %rax, %r15 ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: addq {{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload @@ -4993,7 +4988,7 @@ ; X64-NEXT: movq %r10, %rdi ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r11 -; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload ; X64-NEXT: movq %rsi, %rax ; X64-NEXT: mulq %rdi @@ -5006,7 +5001,7 @@ ; X64-NEXT: mulq %r9 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rbx, %rax -; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq %rdi, %rcx ; X64-NEXT: setb %bl ; X64-NEXT: movq %rsi, %rax @@ -5053,9 +5048,9 @@ ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r13 # 8-byte Reload +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload ; X64-NEXT: addq %r13, %rsi -; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 # 8-byte Reload ; X64-NEXT: adcq %r14, %rcx ; X64-NEXT: addq %rax, %rsi @@ 
-5068,7 +5063,7 @@ ; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload ; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload -; X64-NEXT: movq %rcx, (%rsp) # 8-byte Spill +; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: setb -{{[0-9]+}}(%rsp) # 1-byte Folded Spill ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload ; X64-NEXT: movq %rbx, %rax @@ -5103,8 +5098,9 @@ ; X64-NEXT: adcq %rdx, %rcx ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r14 # 8-byte Reload ; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload -; X64-NEXT: adcq (%rsp), %r10 # 8-byte Folded Reload -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax # 1-byte Folded Reload +; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r10 # 8-byte Folded Reload +; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al # 1-byte Reload +; X64-NEXT: movzbl %al, %eax ; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: addq {{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload @@ -5112,32 +5108,33 @@ ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; X64-NEXT: addq %rax, {{[0-9]+}}(%rsp) # 8-byte Folded Spill +; X64-NEXT: addq %rax, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; X64-NEXT: adcq %rax, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill +; X64-NEXT: adcq %rax, {{[0-9]+}}(%rsp) # 8-byte Folded Spill ; X64-NEXT: adcq %r15, %r12 ; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq %rbp, %r11 -; X64-NEXT: movq %r11, (%rsp) # 8-byte Spill -; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax # 1-byte Folded Reload +; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al # 1-byte Reload +; X64-NEXT: movzbl %al, %eax ; X64-NEXT: adcq %rax, %r14 ; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %r10 -; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %r10, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; X64-NEXT: movq 64(%rcx), %r11 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload +; X64-NEXT: movq 64(%rcx), %r10 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 # 8-byte Reload ; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rsi, %rbx @@ -5153,48 +5150,47 @@ ; X64-NEXT: setb %bl ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rcx -; X64-NEXT: movq %rcx, %r10 -; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rcx, %r12 +; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: addq %rsi, %rdi ; X64-NEXT: movzbl %bl, %eax ; X64-NEXT: adcq %rax, %rcx -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r10, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: mulq %rdx ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: movq %rdx, %r14 -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r12 # 8-byte Reload -; 
X64-NEXT: addq %rbx, %r12 +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload +; X64-NEXT: addq %rbx, %r11 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r15 # 8-byte Reload ; X64-NEXT: adcq %r14, %r15 -; X64-NEXT: addq %rdi, %r12 +; X64-NEXT: addq %rdi, %r11 ; X64-NEXT: adcq %rcx, %r15 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r11, %rsi -; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: mulq %rsi -; X64-NEXT: movq %rdx, %r11 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload +; X64-NEXT: movq %r13, %rax +; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 # 8-byte Reload -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %rsi +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rdi -; X64-NEXT: addq %r11, %rdi +; X64-NEXT: addq %r9, %rdi ; X64-NEXT: adcq $0, %rsi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, %r11 -; X64-NEXT: mulq %r10 +; X64-NEXT: movq %r13, %rax +; X64-NEXT: mulq %r12 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rdi, %rax ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %sil -; X64-NEXT: movq %r9, %rax -; X64-NEXT: mulq %r10 +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rbp, %r9 +; X64-NEXT: mulq %r12 ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %sil, %ecx ; X64-NEXT: adcq %rcx, %rdx @@ -5202,16 +5198,16 @@ ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %r14 # 8-byte Folded Reload ; X64-NEXT: addq %rax, %rbx ; X64-NEXT: adcq %rdx, %r14 -; X64-NEXT: addq %r13, %rbx +; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload ; X64-NEXT: adcq %r8, %r14 -; X64-NEXT: adcq $0, %r12 +; X64-NEXT: adcq $0, %r11 ; X64-NEXT: adcq $0, %r15 -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload ; X64-NEXT: movq 80(%rbp), %rdi -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %r8 -; X64-NEXT: movq %rax, %r13 +; X64-NEXT: movq %rax, %r12 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rsi @@ -5219,39 +5215,39 @@ ; X64-NEXT: addq %r8, %rcx ; X64-NEXT: adcq $0, %rsi ; X64-NEXT: movq 88(%rbp), %r10 -; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r13, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rbp ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rcx, %r8 ; X64-NEXT: adcq %rsi, %rbp -; X64-NEXT: setb %r11b +; X64-NEXT: setb %r13b ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r10 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %rbp, %rsi -; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: movzbl %r13b, %eax ; X64-NEXT: adcq %rax, %rcx ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: xorl %edx, %edx ; X64-NEXT: mulq %rdx -; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload -; X64-NEXT: addq %r9, %rbp +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload +; X64-NEXT: addq %r9, %rdx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload -; X64-NEXT: adcq %rdx, %rax -; X64-NEXT: addq %rsi, %rbp +; X64-NEXT: adcq %r13, %rax +; X64-NEXT: addq %rsi, %rdx ; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: addq %rbx, %r13 -; X64-NEXT: movq %r13, 
{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: addq %rbx, %r12 +; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq %r14, %r8 ; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq $0, %rbp +; X64-NEXT: adcq $0, %rdx ; X64-NEXT: adcq $0, %rax -; X64-NEXT: addq %r12, %rbp -; X64-NEXT: movq %rbp, %r8 +; X64-NEXT: addq %r11, %rdx +; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: adcq %r15, %rax ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: setb %r14b @@ -5282,7 +5278,7 @@ ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload ; X64-NEXT: addq %r9, %rsi ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload +; X64-NEXT: adcq %r13, %rcx ; X64-NEXT: addq %rax, %rsi ; X64-NEXT: adcq %rdx, %rcx ; X64-NEXT: addq %r8, %r12 @@ -5344,7 +5340,7 @@ ; X64-NEXT: adcq %rax, %r12 ; X64-NEXT: addq %r9, %r13 ; X64-NEXT: adcq %r8, %r12 -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx # 8-byte Reload +; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx # 8-byte Reload ; X64-NEXT: movq 120(%rdx), %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 # 8-byte Reload ; X64-NEXT: imulq %r10, %rcx @@ -5404,10 +5400,10 @@ ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload ; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload -; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload ; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload ; X64-NEXT: movq 80(%rsi), %rdi ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload @@ -5446,7 +5442,7 @@ ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rax, %rsi -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 # 8-byte Reload +; X64-NEXT: movq (%rsp), %r12 # 8-byte Reload ; X64-NEXT: addq %r12, %rsi ; X64-NEXT: movq %rdx, %r10 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r8 # 8-byte Reload @@ -5476,11 +5472,11 @@ ; X64-NEXT: movq %r9, %rax ; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %r15 -; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rcx, %rbp +; X64-NEXT: movq %rdx, %rbp +; X64-NEXT: movq %rax, %rbx +; X64-NEXT: addq %rcx, %rbx ; X64-NEXT: movzbl %r11b, %eax -; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: adcq %rax, %rbp ; X64-NEXT: movq %r13, %rax ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: mulq %rcx @@ -5489,10 +5485,10 @@ ; X64-NEXT: movq %r12, %rcx ; X64-NEXT: addq %r15, %rcx ; X64-NEXT: adcq %r11, %r8 -; X64-NEXT: addq %rbp, %rcx -; X64-NEXT: adcq %rbx, %r8 +; X64-NEXT: addq %rbx, %rcx +; X64-NEXT: adcq %rbp, %r8 ; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload -; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rcx, (%rsp) # 8-byte Spill ; X64-NEXT: adcq %r14, %r8 ; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %rsi @@ -5502,7 +5498,7 @@ ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rax, %r12 +; X64-NEXT: movq %rax, %r14 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: movq %rdi, %r8 @@ -5523,14 +5519,14 @@ ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: adcq %rcx, %rdx -; X64-NEXT: movq 
{{[0-9]+}}(%rsp), %r14 # 8-byte Reload -; X64-NEXT: addq %r14, %r15 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12 # 8-byte Reload +; X64-NEXT: addq %r12, %r15 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload ; X64-NEXT: adcq %r13, %r11 ; X64-NEXT: addq %rax, %r15 ; X64-NEXT: adcq %rdx, %r11 -; X64-NEXT: addq {{[0-9]+}}(%rsp), %r12 # 8-byte Folded Reload -; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: addq (%rsp), %r14 # 8-byte Folded Reload +; X64-NEXT: movq %r14, (%rsp) # 8-byte Spill ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload ; X64-NEXT: movq %rbp, {{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %r15 @@ -5547,7 +5543,7 @@ ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdi, %r12 +; X64-NEXT: movq %rdi, %r14 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rbx ; X64-NEXT: addq %rcx, %rbx @@ -5567,7 +5563,7 @@ ; X64-NEXT: movzbl %r8b, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload -; X64-NEXT: addq %r14, %rsi +; X64-NEXT: addq %r12, %rsi ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload ; X64-NEXT: adcq %r13, %rcx ; X64-NEXT: addq %rax, %rsi @@ -5581,11 +5577,11 @@ ; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq $0, %rcx ; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload ; X64-NEXT: movq 96(%rbp), %rcx ; X64-NEXT: imulq %rcx, %rdi ; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %r12, %rsi +; X64-NEXT: movq %r14, %rsi ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rdi, %rdx @@ -5611,7 +5607,7 @@ ; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %rbx -; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rbp, %rax ; X64-NEXT: movq %rbp, %r9 ; X64-NEXT: mulq %rcx @@ -5643,25 +5639,25 @@ ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: addq %rsi, %rdx -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload -; X64-NEXT: imulq %r11, %rcx +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r14 # 8-byte Reload +; X64-NEXT: imulq %r14, %rcx ; X64-NEXT: addq %rdx, %rcx ; X64-NEXT: movq %rcx, %r9 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r15 # 8-byte Reload ; X64-NEXT: imulq %r15, %rcx -; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14 # 8-byte Reload -; X64-NEXT: mulq %r14 +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r11 # 8-byte Reload +; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rax, %r10 ; X64-NEXT: addq %rcx, %rdx ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload -; X64-NEXT: imulq %r14, %rax +; X64-NEXT: imulq %r11, %rax ; X64-NEXT: addq %rdx, %rax ; X64-NEXT: addq %r8, %r10 ; X64-NEXT: adcq %r9, %rax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq %r14, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %r13 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r8 @@ -5671,21 +5667,21 @@ ; X64-NEXT: movq %rax, %rcx ; X64-NEXT: addq %rdi, %rcx ; X64-NEXT: adcq $0, %r9 -; X64-NEXT: movq %r14, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: movq %r11, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %rcx, %rsi ; X64-NEXT: adcq %r9, %rdi ; X64-NEXT: setb %cl ; X64-NEXT: movq %r15, %rax -; X64-NEXT: mulq %r11 +; X64-NEXT: mulq %r14 ; 
X64-NEXT: addq %rdi, %rax ; X64-NEXT: movzbl %cl, %ecx ; X64-NEXT: adcq %rcx, %rdx ; X64-NEXT: addq %r10, %rax ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload -; X64-NEXT: addq {{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload +; X64-NEXT: addq -{{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload ; X64-NEXT: adcq %r12, %rsi ; X64-NEXT: adcq %rbp, %rax ; X64-NEXT: adcq %rbx, %rdx @@ -5697,22 +5693,22 @@ ; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload -; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload +; X64-NEXT: movq (%rsp), %rbp # 8-byte Reload ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload -; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload +; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload -; X64-NEXT: addq {{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload +; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload ; X64-NEXT: movq %rcx, %r9 -; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload +; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdi # 8-byte Folded Reload ; X64-NEXT: movq %rdi, %r10 ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rbp # 8-byte Folded Reload -; X64-NEXT: adcq (%rsp), %rbx # 8-byte Folded Reload +; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rbx # 8-byte Folded Reload ; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %r8 # 8-byte Folded Reload -; X64-NEXT: adcq -{{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload +; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rsi # 8-byte Folded Reload ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rax # 8-byte Folded Reload ; X64-NEXT: adcq {{[0-9]+}}(%rsp), %rdx # 8-byte Folded Reload ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload Index: test/CodeGen/X86/mul-i512.ll =================================================================== --- test/CodeGen/X86/mul-i512.ll +++ test/CodeGen/X86/mul-i512.ll @@ -902,37 +902,35 @@ ; X64-NEXT: pushq %rbx ; X64-NEXT: pushq %rax ; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; X64-NEXT: movq 24(%rdi), %r11 -; X64-NEXT: movq 16(%rdi), %r15 +; X64-NEXT: movq 24(%rdi), %rbp +; X64-NEXT: movq 16(%rdi), %r11 ; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: movq 8(%rsi), %rbp -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq 8(%rsi), %r14 +; X64-NEXT: movq %r11, %rax ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rdx, %r9 ; X64-NEXT: movq %rax, %r8 -; X64-NEXT: movq %r11, %rax -; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %rsi ; X64-NEXT: movq %rsi, %r10 ; X64-NEXT: movq %rdx, %rbx ; X64-NEXT: movq %rax, %rsi ; X64-NEXT: addq %r9, %rsi ; X64-NEXT: adcq $0, %rbx -; X64-NEXT: movq %r15, %rax -; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: mulq %rbp +; X64-NEXT: movq %r11, %rax +; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rax, %r9 ; X64-NEXT: addq %rsi, %r9 ; X64-NEXT: adcq %rbx, %rcx ; X64-NEXT: setb %al ; X64-NEXT: movzbl %al, %ebx -; X64-NEXT: movq %r11, %rax -; X64-NEXT: mulq %rbp 
-; X64-NEXT: movq %rbp, %r14 -; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill +; X64-NEXT: movq %rbp, %rax +; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbp ; X64-NEXT: addq %rcx, %rbp @@ -944,7 +942,7 @@ ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r13 ; X64-NEXT: movq %rax, %r10 -; X64-NEXT: movq %r15, %rax +; X64-NEXT: movq %r11, %rax ; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: movq %rax, %r15 @@ -969,22 +967,25 @@ ; X64-NEXT: addq %r11, %rsi ; X64-NEXT: adcq $0, %rbp ; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %r14 -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: addq %rsi, %rax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq %rbp, %rbx +; X64-NEXT: adcq %rbp, %rcx ; X64-NEXT: setb %r11b ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %rbp -; X64-NEXT: addq %rbx, %rbp +; X64-NEXT: addq %rcx, %rbp ; X64-NEXT: movzbl %r11b, %eax ; X64-NEXT: adcq %rax, %rsi -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: xorl %edx, %edx -; X64-NEXT: mulq %rdx +; X64-NEXT: movq %rbx, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: mulq %rcx ; X64-NEXT: movq %rdx, %r14 ; X64-NEXT: movq %rax, %r11 ; X64-NEXT: addq %r11, %r10 @@ -996,36 +997,36 @@ ; X64-NEXT: adcq $0, %r15 ; X64-NEXT: adcq $0, %r12 ; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload -; X64-NEXT: movq 16(%rsi), %r8 -; X64-NEXT: movq %rcx, %rax -; X64-NEXT: movq %rcx, %r9 +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload +; X64-NEXT: movq 16(%rcx), %r8 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdi, %r9 ; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rdi ; X64-NEXT: movq %rax, %r12 -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %r8 -; X64-NEXT: movq %rdx, %rbp -; X64-NEXT: movq %rax, %rbx -; X64-NEXT: addq %rdi, %rbx -; X64-NEXT: adcq $0, %rbp -; X64-NEXT: movq 24(%rsi), %rdi +; X64-NEXT: movq %rdx, %rsi +; X64-NEXT: movq %rax, %rbp +; X64-NEXT: addq %rdi, %rbp +; X64-NEXT: adcq $0, %rsi +; X64-NEXT: movq 24(%rcx), %rdi ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rsi -; X64-NEXT: addq %rbx, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: addq %rbp, %rax ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill -; X64-NEXT: adcq %rbp, %rsi +; X64-NEXT: adcq %rsi, %rcx ; X64-NEXT: setb %bpl -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rbx, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r9 -; X64-NEXT: addq %rsi, %r9 +; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: movzbl %bpl, %eax -; X64-NEXT: adcq %rax, %rbx +; X64-NEXT: adcq %rax, %rsi ; X64-NEXT: movq %r8, %rax ; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: mulq %rcx @@ -1034,7 +1035,7 @@ ; X64-NEXT: addq %rbp, %r11 ; X64-NEXT: adcq %rdx, %r14 ; X64-NEXT: addq %r9, %r11 -; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: adcq %rsi, %r14 ; X64-NEXT: addq %r10, %r12 ; X64-NEXT: movq %r12, -{{[0-9]+}}(%rsp) # 8-byte Spill ; X64-NEXT: adcq %r13, -{{[0-9]+}}(%rsp) # 8-byte Folded Spill Index: test/CodeGen/X86/musttail-varargs.ll 
=================================================================== --- test/CodeGen/X86/musttail-varargs.ll +++ test/CodeGen/X86/musttail-varargs.ll @@ -129,8 +129,8 @@ ; LINUX: jmpq *%rdi # TAILCALL ; LINUX-X32-LABEL: g_thunk: -; LINUX-X32-DAG: movl %edi, %[[REG:e[abcd]x|ebp|esi|edi|r8|r9|r1[0-5]]] -; LINUX-X32-DAG: jmpq *%[[REG]] # TAILCALL +; LINUX-X32-DAG: movl %edi, %edi +; LINUX-X32-DAG: jmpq *%rdi # TAILCALL ; WINDOWS-LABEL: g_thunk: ; WINDOWS-NOT: movq Index: test/CodeGen/X86/pr32340.ll =================================================================== --- test/CodeGen/X86/pr32340.ll +++ test/CodeGen/X86/pr32340.ll @@ -26,27 +26,27 @@ ; X64-NEXT: addl %eax, %esi ; X64-NEXT: movslq %esi, %r8 ; X64-NEXT: movq %r8, var_826 -; X64-NEXT: movzwl var_32, %eax -; X64-NEXT: movl %eax, %r8d +; X64-NEXT: movw var_32, %r9w +; X64-NEXT: movzwl %r9w, %r8d ; X64-NEXT: movzwl var_901, %eax ; X64-NEXT: xorl $51981, %eax # imm = 0xCB0D -; X64-NEXT: movslq %eax, %r9 -; X64-NEXT: xorq %rdx, %r9 +; X64-NEXT: movslq %eax, %r10 +; X64-NEXT: xorq %rdx, %r10 ; X64-NEXT: movq %r8, %rdx -; X64-NEXT: xorq %r9, %rdx +; X64-NEXT: xorq %r10, %rdx ; X64-NEXT: xorq $-1, %rdx ; X64-NEXT: xorq %rdx, %r8 ; X64-NEXT: movq %r8, %rdx ; X64-NEXT: orq var_57, %rdx ; X64-NEXT: orq %rdx, %r8 -; X64-NEXT: movw %r8w, %r10w -; X64-NEXT: movw %r10w, var_900 +; X64-NEXT: movw %r8w, %r9w +; X64-NEXT: movw %r9w, var_900 ; X64-NEXT: cmpq var_28, %rcx ; X64-NEXT: setne %r11b ; X64-NEXT: andb $1, %r11b ; X64-NEXT: movzbl %r11b, %eax -; X64-NEXT: movw %ax, %r10w -; X64-NEXT: movw %r10w, var_827 +; X64-NEXT: movw %ax, %r9w +; X64-NEXT: movw %r9w, var_827 ; X64-NEXT: retq entry: store i16 0, i16* @var_825, align 2 Index: test/CodeGen/X86/pr32345.ll =================================================================== --- test/CodeGen/X86/pr32345.ll +++ test/CodeGen/X86/pr32345.ll @@ -27,7 +27,6 @@ ; X640-NEXT: movzwl var_27, %ecx ; X640-NEXT: subl $16610, %ecx # imm = 0x40E2 ; X640-NEXT: movl %ecx, %ecx -; X640-NEXT: # kill: %RCX %ECX ; X640-NEXT: # kill: %CL %RCX ; X640-NEXT: sarq %cl, %rsi ; X640-NEXT: movb %sil, %cl Index: test/CodeGen/X86/tail-dup-merge-loop-headers.ll =================================================================== --- test/CodeGen/X86/tail-dup-merge-loop-headers.ll +++ test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -73,11 +73,11 @@ ; CHECK-LABEL: loop_shared_header ; CHECK: # %entry ; CHECK: # %shared_preheader -; CHECK: # %shared_loop_header -; CHECK: # %inner_loop_body ; CHECK: # %outer_loop_latch ; CHECK: # %merge_predecessor_split ; CHECK: # %outer_loop_latch +; CHECK: # %shared_loop_header +; CHECK: # %inner_loop_body ; CHECK: # %cleanup define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i32 %wwprva, i32 %e_lfanew, i8* readonly %wwp, i32 %wwpsz, i16 zeroext %sects) local_unnamed_addr #0 { entry:
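
Note on why the post-RA expansion is safe (illustrative, not part of the patch):
expandZeroExtend64 may rewrite MOVZX64rr8_alt/MOVZX64rr16_alt/MOVZX64rr32 into
MOVZX32rr8/MOVZX32rr16/MOV32rr with only an implicit-def of the 64-bit register
because, on x86-64, any instruction that writes a 32-bit GPR clears bits 63:32
of the containing 64-bit register. The stand-alone C++ sketch below (GCC/Clang
extended inline asm, x86-64 only; zext32to64 is a made-up name used purely for
illustration) demonstrates that behavior:

  // Illustrative only -- not part of the patch. Shows the implicit
  // zero-extension that makes "movl %esi, %eax" a valid lowering of a
  // 32-bit to 64-bit zext.
  #include <cstdint>
  #include <cstdio>

  static uint64_t zext32to64(uint64_t In) {
    uint64_t Out;
    // Write only the 32-bit alias (%k0) of the destination register; the CPU
    // zeroes the upper 32 bits of the full 64-bit register as a side effect.
    asm("movl %k1, %k0" : "=r"(Out) : "r"(In));
    return Out;
  }

  int main() {
    // Prints 0x1: the high half of the input is dropped and the high half of
    // the result is guaranteed to be zero.
    printf("%#llx\n", (unsigned long long)zext32to64(0xdeadbeef00000001ULL));
    return 0;
  }

Keeping the zero extension as a single pseudo with a 64-bit def until after
register allocation (instead of emitting SUBREG_TO_REG during selection) is
also what lets X86FixupSetCC and isCoalescableExtInstr above treat these
opcodes the same way as the existing sign-extend instructions.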