diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2148,9 +2148,11 @@ setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); setLibcallName(RTLIB::MUL_I128, nullptr); + // The MULO libcall is not part of libgcc, only compiler-rt. setLibcallName(RTLIB::MULO_I64, nullptr); - setLibcallName(RTLIB::MULO_I128, nullptr); } + // The MULO libcall is not part of libgcc, only compiler-rt. + setLibcallName(RTLIB::MULO_I128, nullptr); // Combine sin / cos into _sincos_stret if it is available. if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && diff --git a/llvm/test/CodeGen/X86/muloti.ll b/llvm/test/CodeGen/X86/muloti.ll --- a/llvm/test/CodeGen/X86/muloti.ll +++ b/llvm/test/CodeGen/X86/muloti.ll @@ -1,9 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s %0 = type { i64, i64 } %1 = type { i128, i1 } +; This used to call muloti4, but that won't link with libgcc. define %0 @x(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) nounwind uwtable ssp { -; CHECK: x +; CHECK-LABEL: x: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset %rbx, -32 +; CHECK-NEXT: .cfi_offset %r14, -24 +; CHECK-NEXT: .cfi_offset %r15, -16 +; CHECK-NEXT: movq %rdx, %r11 +; CHECK-NEXT: movq %rsi, %r9 +; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: sarq $63, %rsi +; CHECK-NEXT: movq %rdx, %rdi +; CHECK-NEXT: imulq %rsi, %rdi +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: mulq %rsi +; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: addq %rdi, %rdx +; CHECK-NEXT: imulq %rcx, %rsi +; CHECK-NEXT: addq %rdx, %rsi +; CHECK-NEXT: movq %rcx, %rdi +; CHECK-NEXT: sarq $63, %rdi +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: imulq %r9, %rbx +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: mulq %r15 +; CHECK-NEXT: movq %rax, %r10 +; CHECK-NEXT: addq %rbx, %rdx +; CHECK-NEXT: imulq %r15, %rdi +; CHECK-NEXT: addq %rdx, %rdi +; CHECK-NEXT: addq %r8, %r10 +; CHECK-NEXT: adcq %rsi, %rdi +; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: mulq %r11 +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movq %rax, %r8 +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: mulq %r11 +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: addq %r14, %rsi +; CHECK-NEXT: adcq $0, %rbx +; CHECK-NEXT: movq %r15, %rax +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: movq %rdx, %r14 +; CHECK-NEXT: movq %rax, %r11 +; CHECK-NEXT: addq %rsi, %r11 +; CHECK-NEXT: adcq %rbx, %r14 +; CHECK-NEXT: setb %al +; CHECK-NEXT: movzbl %al, %esi +; CHECK-NEXT: movq %r9, %rax +; CHECK-NEXT: mulq %rcx +; CHECK-NEXT: addq %r14, %rax +; CHECK-NEXT: adcq %rsi, %rdx +; CHECK-NEXT: addq %r10, %rax +; CHECK-NEXT: adcq %rdi, %rdx +; CHECK-NEXT: movq %r11, %rcx +; CHECK-NEXT: sarq $63, %rcx +; CHECK-NEXT: xorq %rcx, %rdx +; CHECK-NEXT: xorq %rax, %rcx +; CHECK-NEXT: orq %rdx, %rcx +; CHECK-NEXT: jne LBB0_1 +; CHECK-NEXT: ## %bb.2: ## %nooverflow +; CHECK-NEXT: movq %r8, %rax +; CHECK-NEXT: movq %r11, %rdx +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: retq +; CHECK-NEXT: LBB0_1: ## %overflow +; CHECK-NEXT: ud2 entry: %tmp16 = zext i64 %a.coerce0 to i128 %tmp11 = zext i64 %a.coerce1 to i128 @@ -14,7 +89,6 @@ %tmp4 = shl nuw i128 %tmp3, 64 %ins = or i128 %tmp4, %tmp6 %0 = tail call %1 @llvm.smul.with.overflow.i128(i128 %ins14, i128 %ins) -; CHECK: callq ___muloti4 %1 = extractvalue %1 %0, 0 %2 = extractvalue %1 %0, 1 br i1 %2, label %overflow, label %nooverflow diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -5,22 +5,77 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, i128* %res) { ; X64-LABEL: smuloi128: ; X64: ## %bb.0: -; X64-NEXT: pushq %rbx +; X64-NEXT: pushq %r15 ; X64-NEXT: .cfi_def_cfa_offset 16 -; X64-NEXT: subq $16, %rsp +; X64-NEXT: pushq %r14 +; X64-NEXT: .cfi_def_cfa_offset 24 +; X64-NEXT: pushq %r12 ; X64-NEXT: .cfi_def_cfa_offset 32 -; X64-NEXT: .cfi_offset %rbx, -16 -; X64-NEXT: movq %r8, %rbx -; X64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; X64-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; X64-NEXT: callq ___muloti4 -; X64-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; X64-NEXT: setne %cl -; X64-NEXT: movq %rdx, 8(%rbx) -; X64-NEXT: movq %rax, (%rbx) -; X64-NEXT: movl %ecx, %eax -; X64-NEXT: addq $16, %rsp +; X64-NEXT: pushq %rbx +; X64-NEXT: .cfi_def_cfa_offset 40 +; X64-NEXT: .cfi_offset %rbx, -40 +; X64-NEXT: .cfi_offset %r12, -32 +; X64-NEXT: .cfi_offset %r14, -24 +; X64-NEXT: .cfi_offset %r15, -16 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rsi, %r10 +; X64-NEXT: movq %rdi, %r15 +; X64-NEXT: sarq $63, %rsi +; X64-NEXT: movq %rdx, %rdi +; X64-NEXT: imulq %rsi, %rdi +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: mulq %rsi +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: addq %rdi, %rdx +; X64-NEXT: imulq %rcx, %rsi +; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: sarq $63, %rdi +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: imulq %r10, %rbx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r15 +; X64-NEXT: movq %rax, %r11 +; X64-NEXT: addq %rbx, %rdx +; X64-NEXT: imulq %r15, %rdi +; X64-NEXT: addq %rdx, %rdi +; X64-NEXT: addq %r9, %r11 +; X64-NEXT: adcq %rsi, %rdi +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r9 +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %r12 +; X64-NEXT: movq %rdx, %rbx +; X64-NEXT: movq %rax, %rsi +; X64-NEXT: addq %r14, %rsi +; X64-NEXT: adcq $0, %rbx +; X64-NEXT: movq %r15, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rax, %r15 +; X64-NEXT: addq %rsi, %r15 +; X64-NEXT: adcq %rbx, %r14 +; X64-NEXT: setb %al +; X64-NEXT: movzbl %al, %esi +; X64-NEXT: movq %r10, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: addq %r14, %rax +; X64-NEXT: adcq %rsi, %rdx +; X64-NEXT: addq %r11, %rax +; X64-NEXT: adcq %rdi, %rdx +; X64-NEXT: movq %r15, 8(%r8) +; X64-NEXT: sarq $63, %r15 +; X64-NEXT: xorq %r15, %rdx +; X64-NEXT: xorq %rax, %r15 +; X64-NEXT: orq %rdx, %r15 +; X64-NEXT: setne %al +; X64-NEXT: movq %r9, (%r8) ; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 ; X64-NEXT: retq ; ; X86-LABEL: smuloi128: diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -3321,39 +3321,127 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: subq $24, %rsp +; SSE2-NEXT: movq %r8, %r14 +; SSE2-NEXT: movq %rcx, %r11 +; SSE2-NEXT: movq %rdx, %r15 +; SSE2-NEXT: movq %rsi, %r13 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE2-NEXT: movq %rsi, %rcx +; SSE2-NEXT: sarq $63, %rcx +; SSE2-NEXT: movq %r14, %rsi +; SSE2-NEXT: imulq %rcx, %rsi +; SSE2-NEXT: movq %r14, %rax +; SSE2-NEXT: mulq %rcx +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: addq %rsi, %rdx +; SSE2-NEXT: imulq %r9, %rcx +; SSE2-NEXT: addq %rdx, %rcx +; SSE2-NEXT: movq %r9, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movq %rbx, %rsi +; SSE2-NEXT: imulq %r13, %rsi +; SSE2-NEXT: movq %rbx, %rax +; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rax, %r12 +; SSE2-NEXT: addq %rsi, %rdx +; SSE2-NEXT: imulq %rdi, %rbx +; SSE2-NEXT: addq %rdx, %rbx +; SSE2-NEXT: addq %r10, %r12 +; SSE2-NEXT: adcq %rcx, %rbx +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: mulq %r14 +; SSE2-NEXT: movq %rdx, %rbp +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %r13, %rax +; SSE2-NEXT: mulq %r14 +; SSE2-NEXT: movq %rdx, %rsi +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: addq %rbp, %rcx +; SSE2-NEXT: adcq $0, %rsi +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: mulq %r9 +; SSE2-NEXT: movq %rdx, %rbp +; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: addq %rcx, %rdi +; SSE2-NEXT: adcq %rsi, %rbp +; SSE2-NEXT: setb %al +; SSE2-NEXT: movzbl %al, %ecx +; SSE2-NEXT: movq %r13, %rax +; SSE2-NEXT: mulq %r9 +; SSE2-NEXT: addq %rbp, %rax +; SSE2-NEXT: adcq %rcx, %rdx +; SSE2-NEXT: addq %r12, %rax +; SSE2-NEXT: adcq %rbx, %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE2-NEXT: movq %rdi, 8(%r13) +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: xorq %rdi, %rdx +; SSE2-NEXT: xorq %rax, %rdi +; SSE2-NEXT: xorl %r12d, %r12d +; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: setne %r12b +; SSE2-NEXT: movq %r11, %rdi +; SSE2-NEXT: sarq $63, %rdi ; SSE2-NEXT: movq %r8, %rax -; SSE2-NEXT: movq %rcx, %r14 +; SSE2-NEXT: movq %r8, %rsi +; SSE2-NEXT: imulq %rdi, %rsi +; SSE2-NEXT: movq %r8, %rbx +; SSE2-NEXT: mulq %rdi +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: addq %rsi, %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE2-NEXT: imulq %r8, %rdi +; SSE2-NEXT: addq %rdx, %rdi +; SSE2-NEXT: movq %r8, %rsi +; SSE2-NEXT: sarq $63, %rsi +; SSE2-NEXT: movq %rsi, %rbp +; SSE2-NEXT: imulq %r11, %rbp +; SSE2-NEXT: movq %rsi, %rax +; SSE2-NEXT: mulq %r15 +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: addq %rbp, %rdx +; SSE2-NEXT: imulq %r15, %rsi +; SSE2-NEXT: addq %rdx, %rsi +; SSE2-NEXT: addq %rcx, %r14 +; SSE2-NEXT: adcq %rdi, %rsi +; SSE2-NEXT: movq %r15, %rax +; SSE2-NEXT: mulq %rbx +; SSE2-NEXT: movq %rdx, %rcx +; SSE2-NEXT: movq %rax, %r9 +; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: mulq %rbx ; SSE2-NEXT: movq %rdx, %rbx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: movq %r9, %rcx -; SSE2-NEXT: callq __muloti4@PLT -; SSE2-NEXT: movq %rax, %r13 -; SSE2-NEXT: movq %rdx, %rbp -; SSE2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE2-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: movq %rbx, %rdi -; SSE2-NEXT: movq %r14, %rsi -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE2-NEXT: movq %r12, %rcx -; SSE2-NEXT: callq __muloti4@PLT -; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: sbbl %esi, %esi -; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: sbbl %ecx, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: addq %rcx, %rbp +; SSE2-NEXT: adcq $0, %rbx +; SSE2-NEXT: movq %r15, %rax +; SSE2-NEXT: mulq %r8 +; SSE2-NEXT: movq %rdx, %rcx +; SSE2-NEXT: movq %rax, %rdi +; SSE2-NEXT: addq %rbp, %rdi +; SSE2-NEXT: adcq %rbx, %rcx +; SSE2-NEXT: setb %al +; SSE2-NEXT: movzbl %al, %ebp +; SSE2-NEXT: movq %r11, %rax +; SSE2-NEXT: mulq %r8 +; SSE2-NEXT: addq %rcx, %rax +; SSE2-NEXT: adcq %rbp, %rdx +; SSE2-NEXT: addq %r14, %rax +; SSE2-NEXT: adcq %rsi, %rdx +; SSE2-NEXT: movq %rdi, 24(%r13) +; SSE2-NEXT: sarq $63, %rdi +; SSE2-NEXT: xorq %rdi, %rdx +; SSE2-NEXT: xorq %rax, %rdi +; SSE2-NEXT: xorl %eax, %eax +; SSE2-NEXT: orq %rdx, %rdi +; SSE2-NEXT: setne %al +; SSE2-NEXT: negl %eax +; SSE2-NEXT: movd %eax, %xmm1 +; SSE2-NEXT: negl %r12d +; SSE2-NEXT: movd %r12d, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq %rdx, 24(%r15) -; SSE2-NEXT: movq %rax, 16(%r15) -; SSE2-NEXT: movq %rbp, 8(%r15) -; SSE2-NEXT: movq %r13, (%r15) -; SSE2-NEXT: addq $24, %rsp +; SSE2-NEXT: movq %r9, 16(%r13) +; SSE2-NEXT: movq %r10, (%r13) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3370,39 +3458,127 @@ ; SSSE3-NEXT: pushq %r13 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: subq $24, %rsp +; SSSE3-NEXT: movq %r8, %r14 +; SSSE3-NEXT: movq %rcx, %r11 +; SSSE3-NEXT: movq %rdx, %r15 +; SSSE3-NEXT: movq %rsi, %r13 +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSSE3-NEXT: movq %rsi, %rcx +; SSSE3-NEXT: sarq $63, %rcx +; SSSE3-NEXT: movq %r14, %rsi +; SSSE3-NEXT: imulq %rcx, %rsi +; SSSE3-NEXT: movq %r14, %rax +; SSSE3-NEXT: mulq %rcx +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: addq %rsi, %rdx +; SSSE3-NEXT: imulq %r9, %rcx +; SSSE3-NEXT: addq %rdx, %rcx +; SSSE3-NEXT: movq %r9, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: movq %rbx, %rsi +; SSSE3-NEXT: imulq %r13, %rsi +; SSSE3-NEXT: movq %rbx, %rax +; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rax, %r12 +; SSSE3-NEXT: addq %rsi, %rdx +; SSSE3-NEXT: imulq %rdi, %rbx +; SSSE3-NEXT: addq %rdx, %rbx +; SSSE3-NEXT: addq %r10, %r12 +; SSSE3-NEXT: adcq %rcx, %rbx +; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: mulq %r14 +; SSSE3-NEXT: movq %rdx, %rbp +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %r13, %rax +; SSSE3-NEXT: mulq %r14 +; SSSE3-NEXT: movq %rdx, %rsi +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: addq %rbp, %rcx +; SSSE3-NEXT: adcq $0, %rsi +; SSSE3-NEXT: movq %rdi, %rax +; SSSE3-NEXT: mulq %r9 +; SSSE3-NEXT: movq %rdx, %rbp +; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: addq %rcx, %rdi +; SSSE3-NEXT: adcq %rsi, %rbp +; SSSE3-NEXT: setb %al +; SSSE3-NEXT: movzbl %al, %ecx +; SSSE3-NEXT: movq %r13, %rax +; SSSE3-NEXT: mulq %r9 +; SSSE3-NEXT: addq %rbp, %rax +; SSSE3-NEXT: adcq %rcx, %rdx +; SSSE3-NEXT: addq %r12, %rax +; SSSE3-NEXT: adcq %rbx, %rdx +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSSE3-NEXT: movq %rdi, 8(%r13) +; SSSE3-NEXT: sarq $63, %rdi +; SSSE3-NEXT: xorq %rdi, %rdx +; SSSE3-NEXT: xorq %rax, %rdi +; SSSE3-NEXT: xorl %r12d, %r12d +; SSSE3-NEXT: orq %rdx, %rdi +; SSSE3-NEXT: setne %r12b +; SSSE3-NEXT: movq %r11, %rdi +; SSSE3-NEXT: sarq $63, %rdi ; SSSE3-NEXT: movq %r8, %rax -; SSSE3-NEXT: movq %rcx, %r14 +; SSSE3-NEXT: movq %r8, %rsi +; SSSE3-NEXT: imulq %rdi, %rsi +; SSSE3-NEXT: movq %r8, %rbx +; SSSE3-NEXT: mulq %rdi +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: addq %rsi, %rdx +; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSSE3-NEXT: imulq %r8, %rdi +; SSSE3-NEXT: addq %rdx, %rdi +; SSSE3-NEXT: movq %r8, %rsi +; SSSE3-NEXT: sarq $63, %rsi +; SSSE3-NEXT: movq %rsi, %rbp +; SSSE3-NEXT: imulq %r11, %rbp +; SSSE3-NEXT: movq %rsi, %rax +; SSSE3-NEXT: mulq %r15 +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: addq %rbp, %rdx +; SSSE3-NEXT: imulq %r15, %rsi +; SSSE3-NEXT: addq %rdx, %rsi +; SSSE3-NEXT: addq %rcx, %r14 +; SSSE3-NEXT: adcq %rdi, %rsi +; SSSE3-NEXT: movq %r15, %rax +; SSSE3-NEXT: mulq %rbx +; SSSE3-NEXT: movq %rdx, %rcx +; SSSE3-NEXT: movq %rax, %r9 +; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: mulq %rbx ; SSSE3-NEXT: movq %rdx, %rbx -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSSE3-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: movq %r9, %rcx -; SSSE3-NEXT: callq __muloti4@PLT -; SSSE3-NEXT: movq %rax, %r13 -; SSSE3-NEXT: movq %rdx, %rbp -; SSSE3-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSSE3-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; SSSE3-NEXT: movq %rbx, %rdi -; SSSE3-NEXT: movq %r14, %rsi -; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSSE3-NEXT: movq %r12, %rcx -; SSSE3-NEXT: callq __muloti4@PLT -; SSSE3-NEXT: xorl %ecx, %ecx -; SSSE3-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: sbbl %esi, %esi -; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSSE3-NEXT: sbbl %ecx, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: movq %rax, %rbp +; SSSE3-NEXT: addq %rcx, %rbp +; SSSE3-NEXT: adcq $0, %rbx +; SSSE3-NEXT: movq %r15, %rax +; SSSE3-NEXT: mulq %r8 +; SSSE3-NEXT: movq %rdx, %rcx +; SSSE3-NEXT: movq %rax, %rdi +; SSSE3-NEXT: addq %rbp, %rdi +; SSSE3-NEXT: adcq %rbx, %rcx +; SSSE3-NEXT: setb %al +; SSSE3-NEXT: movzbl %al, %ebp +; SSSE3-NEXT: movq %r11, %rax +; SSSE3-NEXT: mulq %r8 +; SSSE3-NEXT: addq %rcx, %rax +; SSSE3-NEXT: adcq %rbp, %rdx +; SSSE3-NEXT: addq %r14, %rax +; SSSE3-NEXT: adcq %rsi, %rdx +; SSSE3-NEXT: movq %rdi, 24(%r13) +; SSSE3-NEXT: sarq $63, %rdi +; SSSE3-NEXT: xorq %rdi, %rdx +; SSSE3-NEXT: xorq %rax, %rdi +; SSSE3-NEXT: xorl %eax, %eax +; SSSE3-NEXT: orq %rdx, %rdi +; SSSE3-NEXT: setne %al +; SSSE3-NEXT: negl %eax +; SSSE3-NEXT: movd %eax, %xmm1 +; SSSE3-NEXT: negl %r12d +; SSSE3-NEXT: movd %r12d, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %rdx, 24(%r15) -; SSSE3-NEXT: movq %rax, 16(%r15) -; SSSE3-NEXT: movq %rbp, 8(%r15) -; SSSE3-NEXT: movq %r13, (%r15) -; SSSE3-NEXT: addq $24, %rsp +; SSSE3-NEXT: movq %r9, 16(%r13) +; SSSE3-NEXT: movq %r10, (%r13) ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3419,38 +3595,126 @@ ; SSE41-NEXT: pushq %r13 ; SSE41-NEXT: pushq %r12 ; SSE41-NEXT: pushq %rbx -; SSE41-NEXT: subq $24, %rsp +; SSE41-NEXT: movq %r8, %r14 +; SSE41-NEXT: movq %rcx, %r11 +; SSE41-NEXT: movq %rdx, %r15 +; SSE41-NEXT: movq %rsi, %r13 +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE41-NEXT: movq %rsi, %rcx +; SSE41-NEXT: sarq $63, %rcx +; SSE41-NEXT: movq %r14, %rsi +; SSE41-NEXT: imulq %rcx, %rsi +; SSE41-NEXT: movq %r14, %rax +; SSE41-NEXT: mulq %rcx +; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: addq %rsi, %rdx +; SSE41-NEXT: imulq %r9, %rcx +; SSE41-NEXT: addq %rdx, %rcx +; SSE41-NEXT: movq %r9, %rbx +; SSE41-NEXT: sarq $63, %rbx +; SSE41-NEXT: movq %rbx, %rsi +; SSE41-NEXT: imulq %r13, %rsi +; SSE41-NEXT: movq %rbx, %rax +; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rax, %r12 +; SSE41-NEXT: addq %rsi, %rdx +; SSE41-NEXT: imulq %rdi, %rbx +; SSE41-NEXT: addq %rdx, %rbx +; SSE41-NEXT: addq %r10, %r12 +; SSE41-NEXT: adcq %rcx, %rbx +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: mulq %r14 +; SSE41-NEXT: movq %rdx, %rbp +; SSE41-NEXT: movq %rax, %r10 +; SSE41-NEXT: movq %r13, %rax +; SSE41-NEXT: mulq %r14 +; SSE41-NEXT: movq %rdx, %rsi +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: addq %rbp, %rcx +; SSE41-NEXT: adcq $0, %rsi +; SSE41-NEXT: movq %rdi, %rax +; SSE41-NEXT: mulq %r9 +; SSE41-NEXT: movq %rdx, %rbp +; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: addq %rcx, %rdi +; SSE41-NEXT: adcq %rsi, %rbp +; SSE41-NEXT: setb %al +; SSE41-NEXT: movzbl %al, %ecx +; SSE41-NEXT: movq %r13, %rax +; SSE41-NEXT: mulq %r9 +; SSE41-NEXT: addq %rbp, %rax +; SSE41-NEXT: adcq %rcx, %rdx +; SSE41-NEXT: addq %r12, %rax +; SSE41-NEXT: adcq %rbx, %rdx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE41-NEXT: movq %rdi, 8(%r13) +; SSE41-NEXT: sarq $63, %rdi +; SSE41-NEXT: xorq %rdi, %rdx +; SSE41-NEXT: xorq %rax, %rdi +; SSE41-NEXT: xorl %r12d, %r12d +; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: setne %r12b +; SSE41-NEXT: movq %r11, %rdi +; SSE41-NEXT: sarq $63, %rdi ; SSE41-NEXT: movq %r8, %rax -; SSE41-NEXT: movq %rcx, %r14 +; SSE41-NEXT: movq %r8, %rsi +; SSE41-NEXT: imulq %rdi, %rsi +; SSE41-NEXT: movq %r8, %rbx +; SSE41-NEXT: mulq %rdi +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: addq %rsi, %rdx +; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE41-NEXT: imulq %r8, %rdi +; SSE41-NEXT: addq %rdx, %rdi +; SSE41-NEXT: movq %r8, %rsi +; SSE41-NEXT: sarq $63, %rsi +; SSE41-NEXT: movq %rsi, %rbp +; SSE41-NEXT: imulq %r11, %rbp +; SSE41-NEXT: movq %rsi, %rax +; SSE41-NEXT: mulq %r15 +; SSE41-NEXT: movq %rax, %r14 +; SSE41-NEXT: addq %rbp, %rdx +; SSE41-NEXT: imulq %r15, %rsi +; SSE41-NEXT: addq %rdx, %rsi +; SSE41-NEXT: addq %rcx, %r14 +; SSE41-NEXT: adcq %rdi, %rsi +; SSE41-NEXT: movq %r15, %rax +; SSE41-NEXT: mulq %rbx +; SSE41-NEXT: movq %rdx, %rcx +; SSE41-NEXT: movq %rax, %r9 +; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: mulq %rbx ; SSE41-NEXT: movq %rdx, %rbx -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE41-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE41-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: movq %r9, %rcx -; SSE41-NEXT: callq __muloti4@PLT -; SSE41-NEXT: movq %rax, %r13 -; SSE41-NEXT: movq %rdx, %rbp -; SSE41-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE41-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; SSE41-NEXT: movq %rbx, %rdi -; SSE41-NEXT: movq %r14, %rsi -; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE41-NEXT: movq %r12, %rcx -; SSE41-NEXT: callq __muloti4@PLT -; SSE41-NEXT: xorl %ecx, %ecx -; SSE41-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: sbbl %esi, %esi -; SSE41-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; SSE41-NEXT: sbbl %ecx, %ecx -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrd $1, %esi, %xmm0 -; SSE41-NEXT: movq %rdx, 24(%r15) -; SSE41-NEXT: movq %rax, 16(%r15) -; SSE41-NEXT: movq %rbp, 8(%r15) -; SSE41-NEXT: movq %r13, (%r15) -; SSE41-NEXT: addq $24, %rsp +; SSE41-NEXT: movq %rax, %rbp +; SSE41-NEXT: addq %rcx, %rbp +; SSE41-NEXT: adcq $0, %rbx +; SSE41-NEXT: movq %r15, %rax +; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: movq %rdx, %rcx +; SSE41-NEXT: movq %rax, %rdi +; SSE41-NEXT: addq %rbp, %rdi +; SSE41-NEXT: adcq %rbx, %rcx +; SSE41-NEXT: setb %al +; SSE41-NEXT: movzbl %al, %ebp +; SSE41-NEXT: movq %r11, %rax +; SSE41-NEXT: mulq %r8 +; SSE41-NEXT: addq %rcx, %rax +; SSE41-NEXT: adcq %rbp, %rdx +; SSE41-NEXT: addq %r14, %rax +; SSE41-NEXT: adcq %rsi, %rdx +; SSE41-NEXT: movq %rdi, 24(%r13) +; SSE41-NEXT: sarq $63, %rdi +; SSE41-NEXT: xorq %rdi, %rdx +; SSE41-NEXT: xorq %rax, %rdi +; SSE41-NEXT: xorl %eax, %eax +; SSE41-NEXT: orq %rdx, %rdi +; SSE41-NEXT: setne %al +; SSE41-NEXT: negl %eax +; SSE41-NEXT: negl %r12d +; SSE41-NEXT: movd %r12d, %xmm0 +; SSE41-NEXT: pinsrd $1, %eax, %xmm0 +; SSE41-NEXT: movq %r9, 16(%r13) +; SSE41-NEXT: movq %r10, (%r13) ; SSE41-NEXT: popq %rbx ; SSE41-NEXT: popq %r12 ; SSE41-NEXT: popq %r13 @@ -3467,38 +3731,126 @@ ; AVX-NEXT: pushq %r13 ; AVX-NEXT: pushq %r12 ; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: movq %r8, %r14 +; AVX-NEXT: movq %rcx, %r11 +; AVX-NEXT: movq %rdx, %r15 +; AVX-NEXT: movq %rsi, %r13 +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX-NEXT: movq %rsi, %rcx +; AVX-NEXT: sarq $63, %rcx +; AVX-NEXT: movq %r14, %rsi +; AVX-NEXT: imulq %rcx, %rsi +; AVX-NEXT: movq %r14, %rax +; AVX-NEXT: mulq %rcx +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: addq %rsi, %rdx +; AVX-NEXT: imulq %r9, %rcx +; AVX-NEXT: addq %rdx, %rcx +; AVX-NEXT: movq %r9, %rbx +; AVX-NEXT: sarq $63, %rbx +; AVX-NEXT: movq %rbx, %rsi +; AVX-NEXT: imulq %r13, %rsi +; AVX-NEXT: movq %rbx, %rax +; AVX-NEXT: mulq %rdi +; AVX-NEXT: movq %rax, %r12 +; AVX-NEXT: addq %rsi, %rdx +; AVX-NEXT: imulq %rdi, %rbx +; AVX-NEXT: addq %rdx, %rbx +; AVX-NEXT: addq %r10, %r12 +; AVX-NEXT: adcq %rcx, %rbx +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: mulq %r14 +; AVX-NEXT: movq %rdx, %rbp +; AVX-NEXT: movq %rax, %r10 +; AVX-NEXT: movq %r13, %rax +; AVX-NEXT: mulq %r14 +; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: addq %rbp, %rcx +; AVX-NEXT: adcq $0, %rsi +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: mulq %r9 +; AVX-NEXT: movq %rdx, %rbp +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: addq %rcx, %rdi +; AVX-NEXT: adcq %rsi, %rbp +; AVX-NEXT: setb %al +; AVX-NEXT: movzbl %al, %ecx +; AVX-NEXT: movq %r13, %rax +; AVX-NEXT: mulq %r9 +; AVX-NEXT: addq %rbp, %rax +; AVX-NEXT: adcq %rcx, %rdx +; AVX-NEXT: addq %r12, %rax +; AVX-NEXT: adcq %rbx, %rdx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX-NEXT: movq %rdi, 8(%r13) +; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: xorq %rdi, %rdx +; AVX-NEXT: xorq %rax, %rdi +; AVX-NEXT: xorl %r12d, %r12d +; AVX-NEXT: orq %rdx, %rdi +; AVX-NEXT: setne %r12b +; AVX-NEXT: movq %r11, %rdi +; AVX-NEXT: sarq $63, %rdi ; AVX-NEXT: movq %r8, %rax -; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movq %r8, %rsi +; AVX-NEXT: imulq %rdi, %rsi +; AVX-NEXT: movq %r8, %rbx +; AVX-NEXT: mulq %rdi +; AVX-NEXT: movq %rax, %rcx +; AVX-NEXT: addq %rsi, %rdx +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX-NEXT: imulq %r8, %rdi +; AVX-NEXT: addq %rdx, %rdi +; AVX-NEXT: movq %r8, %rsi +; AVX-NEXT: sarq $63, %rsi +; AVX-NEXT: movq %rsi, %rbp +; AVX-NEXT: imulq %r11, %rbp +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: mulq %r15 +; AVX-NEXT: movq %rax, %r14 +; AVX-NEXT: addq %rbp, %rdx +; AVX-NEXT: imulq %r15, %rsi +; AVX-NEXT: addq %rdx, %rsi +; AVX-NEXT: addq %rcx, %r14 +; AVX-NEXT: adcq %rdi, %rsi +; AVX-NEXT: movq %r15, %rax +; AVX-NEXT: mulq %rbx +; AVX-NEXT: movq %rdx, %rcx +; AVX-NEXT: movq %rax, %r9 +; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: mulq %rbx ; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: movq %rax, %rdx -; AVX-NEXT: movq %r9, %rcx -; AVX-NEXT: callq __muloti4@PLT -; AVX-NEXT: movq %rax, %r13 -; AVX-NEXT: movq %rdx, %rbp -; AVX-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX-NEXT: movq %rbx, %rdi -; AVX-NEXT: movq %r14, %rsi -; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: movq %r12, %rcx -; AVX-NEXT: callq __muloti4@PLT -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: sbbl %esi, %esi -; AVX-NEXT: cmpq {{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: sbbl %ecx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0 -; AVX-NEXT: movq %rdx, 24(%r15) -; AVX-NEXT: movq %rax, 16(%r15) -; AVX-NEXT: movq %rbp, 8(%r15) -; AVX-NEXT: movq %r13, (%r15) -; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: movq %rax, %rbp +; AVX-NEXT: addq %rcx, %rbp +; AVX-NEXT: adcq $0, %rbx +; AVX-NEXT: movq %r15, %rax +; AVX-NEXT: mulq %r8 +; AVX-NEXT: movq %rdx, %rcx +; AVX-NEXT: movq %rax, %rdi +; AVX-NEXT: addq %rbp, %rdi +; AVX-NEXT: adcq %rbx, %rcx +; AVX-NEXT: setb %al +; AVX-NEXT: movzbl %al, %ebp +; AVX-NEXT: movq %r11, %rax +; AVX-NEXT: mulq %r8 +; AVX-NEXT: addq %rcx, %rax +; AVX-NEXT: adcq %rbp, %rdx +; AVX-NEXT: addq %r14, %rax +; AVX-NEXT: adcq %rsi, %rdx +; AVX-NEXT: movq %rdi, 24(%r13) +; AVX-NEXT: sarq $63, %rdi +; AVX-NEXT: xorq %rdi, %rdx +; AVX-NEXT: xorq %rax, %rdi +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: orq %rdx, %rdi +; AVX-NEXT: setne %al +; AVX-NEXT: negl %eax +; AVX-NEXT: negl %r12d +; AVX-NEXT: vmovd %r12d, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: movq %r9, 16(%r13) +; AVX-NEXT: movq %r10, (%r13) ; AVX-NEXT: popq %rbx ; AVX-NEXT: popq %r12 ; AVX-NEXT: popq %r13 @@ -3515,42 +3867,129 @@ ; AVX512F-NEXT: pushq %r13 ; AVX512F-NEXT: pushq %r12 ; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: subq $24, %rsp -; AVX512F-NEXT: movq %r8, %rax +; AVX512F-NEXT: movq %r9, %r10 +; AVX512F-NEXT: movq %r8, %r9 ; AVX512F-NEXT: movq %rcx, %r14 -; AVX512F-NEXT: movq %rdx, %rbx -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX512F-NEXT: movq %rdx, %rcx +; AVX512F-NEXT: movq %rsi, %r11 +; AVX512F-NEXT: movq %rdi, %r15 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512F-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX512F-NEXT: movq %rax, %rdx -; AVX512F-NEXT: movq %r9, %rcx -; AVX512F-NEXT: callq __muloti4@PLT +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512F-NEXT: movq %r14, %rdi +; AVX512F-NEXT: sarq $63, %rdi +; AVX512F-NEXT: movq %r12, %rbx +; AVX512F-NEXT: imulq %rdi, %rbx +; AVX512F-NEXT: movq %r12, %rax +; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: addq %rbx, %rdx +; AVX512F-NEXT: imulq %r8, %rdi +; AVX512F-NEXT: addq %rdx, %rdi +; AVX512F-NEXT: movq %r8, %rbx +; AVX512F-NEXT: sarq $63, %rbx +; AVX512F-NEXT: movq %rbx, %rbp +; AVX512F-NEXT: imulq %r14, %rbp +; AVX512F-NEXT: movq %rbx, %rax +; AVX512F-NEXT: mulq %rcx ; AVX512F-NEXT: movq %rax, %r13 +; AVX512F-NEXT: addq %rbp, %rdx +; AVX512F-NEXT: imulq %rcx, %rbx +; AVX512F-NEXT: addq %rdx, %rbx +; AVX512F-NEXT: addq %rsi, %r13 +; AVX512F-NEXT: adcq %rdi, %rbx +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: mulq %r12 ; AVX512F-NEXT: movq %rdx, %rbp -; AVX512F-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX512F-NEXT: movq %rbx, %rdi -; AVX512F-NEXT: movq %r14, %rsi -; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512F-NEXT: movq %r12, %rcx -; AVX512F-NEXT: callq __muloti4@PLT -; AVX512F-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: setne %cl -; AVX512F-NEXT: kmovw %ecx, %k0 -; AVX512F-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: setne %cl -; AVX512F-NEXT: andl $1, %ecx -; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512F-NEXT: movq %r14, %rax +; AVX512F-NEXT: mulq %r12 +; AVX512F-NEXT: movq %rdx, %rdi +; AVX512F-NEXT: movq %rax, %rsi +; AVX512F-NEXT: addq %rbp, %rsi +; AVX512F-NEXT: adcq $0, %rdi +; AVX512F-NEXT: movq %rcx, %rax +; AVX512F-NEXT: mulq %r8 +; AVX512F-NEXT: movq %rdx, %rbp +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: addq %rsi, %rcx +; AVX512F-NEXT: adcq %rdi, %rbp +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: movzbl %al, %esi +; AVX512F-NEXT: movq %r14, %rax +; AVX512F-NEXT: mulq %r8 +; AVX512F-NEXT: addq %rbp, %rax +; AVX512F-NEXT: adcq %rsi, %rdx +; AVX512F-NEXT: addq %r13, %rax +; AVX512F-NEXT: adcq %rbx, %rdx +; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512F-NEXT: movq %rcx, 24(%r8) +; AVX512F-NEXT: sarq $63, %rcx +; AVX512F-NEXT: xorq %rcx, %rdx +; AVX512F-NEXT: xorq %rax, %rcx +; AVX512F-NEXT: orq %rdx, %rcx +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: movq %r11, %rdi +; AVX512F-NEXT: sarq $63, %rdi +; AVX512F-NEXT: movq %r9, %rsi +; AVX512F-NEXT: imulq %rdi, %rsi +; AVX512F-NEXT: movq %r9, %rax +; AVX512F-NEXT: mulq %rdi +; AVX512F-NEXT: movq %rax, %rcx +; AVX512F-NEXT: addq %rsi, %rdx +; AVX512F-NEXT: imulq %r10, %rdi +; AVX512F-NEXT: addq %rdx, %rdi +; AVX512F-NEXT: movq %r10, %rsi +; AVX512F-NEXT: sarq $63, %rsi +; AVX512F-NEXT: movq %rsi, %rbp +; AVX512F-NEXT: imulq %r11, %rbp +; AVX512F-NEXT: movq %rsi, %rax +; AVX512F-NEXT: mulq %r15 +; AVX512F-NEXT: movq %rax, %r12 +; AVX512F-NEXT: addq %rbp, %rdx +; AVX512F-NEXT: imulq %r15, %rsi +; AVX512F-NEXT: addq %rdx, %rsi +; AVX512F-NEXT: addq %rcx, %r12 +; AVX512F-NEXT: adcq %rdi, %rsi +; AVX512F-NEXT: movq %r15, %rax +; AVX512F-NEXT: mulq %r9 +; AVX512F-NEXT: movq %rdx, %rcx +; AVX512F-NEXT: movq %rax, %r14 +; AVX512F-NEXT: movq %r11, %rax +; AVX512F-NEXT: mulq %r9 +; AVX512F-NEXT: movq %rdx, %rbp +; AVX512F-NEXT: movq %rax, %rbx +; AVX512F-NEXT: addq %rcx, %rbx +; AVX512F-NEXT: adcq $0, %rbp +; AVX512F-NEXT: movq %r15, %rax +; AVX512F-NEXT: mulq %r10 +; AVX512F-NEXT: movq %rdx, %rcx +; AVX512F-NEXT: movq %rax, %rdi +; AVX512F-NEXT: addq %rbx, %rdi +; AVX512F-NEXT: adcq %rbp, %rcx +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: movzbl %al, %ebp +; AVX512F-NEXT: movq %r11, %rax +; AVX512F-NEXT: mulq %r10 +; AVX512F-NEXT: addq %rcx, %rax +; AVX512F-NEXT: adcq %rbp, %rdx +; AVX512F-NEXT: addq %r12, %rax +; AVX512F-NEXT: adcq %rsi, %rdx +; AVX512F-NEXT: movq %rdi, 8(%r8) +; AVX512F-NEXT: sarq $63, %rdi +; AVX512F-NEXT: xorq %rdi, %rdx +; AVX512F-NEXT: xorq %rax, %rdi +; AVX512F-NEXT: orq %rdx, %rdi +; AVX512F-NEXT: setne %al +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $1, %k0, %k0 ; AVX512F-NEXT: korw %k0, %k1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512F-NEXT: movq %rdx, 24(%r15) -; AVX512F-NEXT: movq %rax, 16(%r15) -; AVX512F-NEXT: movq %rbp, 8(%r15) -; AVX512F-NEXT: movq %r13, (%r15) -; AVX512F-NEXT: addq $24, %rsp +; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512F-NEXT: movq %rax, 16(%r8) +; AVX512F-NEXT: movq %r14, (%r8) ; AVX512F-NEXT: popq %rbx ; AVX512F-NEXT: popq %r12 ; AVX512F-NEXT: popq %r13 @@ -3567,42 +4006,129 @@ ; AVX512BW-NEXT: pushq %r13 ; AVX512BW-NEXT: pushq %r12 ; AVX512BW-NEXT: pushq %rbx -; AVX512BW-NEXT: subq $24, %rsp -; AVX512BW-NEXT: movq %r8, %rax +; AVX512BW-NEXT: movq %r9, %r10 +; AVX512BW-NEXT: movq %r8, %r9 ; AVX512BW-NEXT: movq %rcx, %r14 -; AVX512BW-NEXT: movq %rdx, %rbx -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: movq %rsi, %r11 +; AVX512BW-NEXT: movq %rdi, %r15 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; AVX512BW-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX512BW-NEXT: movq %rax, %rdx -; AVX512BW-NEXT: movq %r9, %rcx -; AVX512BW-NEXT: callq __muloti4@PLT +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512BW-NEXT: movq %r14, %rdi +; AVX512BW-NEXT: sarq $63, %rdi +; AVX512BW-NEXT: movq %r12, %rbx +; AVX512BW-NEXT: imulq %rdi, %rbx +; AVX512BW-NEXT: movq %r12, %rax +; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: addq %rbx, %rdx +; AVX512BW-NEXT: imulq %r8, %rdi +; AVX512BW-NEXT: addq %rdx, %rdi +; AVX512BW-NEXT: movq %r8, %rbx +; AVX512BW-NEXT: sarq $63, %rbx +; AVX512BW-NEXT: movq %rbx, %rbp +; AVX512BW-NEXT: imulq %r14, %rbp +; AVX512BW-NEXT: movq %rbx, %rax +; AVX512BW-NEXT: mulq %rcx ; AVX512BW-NEXT: movq %rax, %r13 +; AVX512BW-NEXT: addq %rbp, %rdx +; AVX512BW-NEXT: imulq %rcx, %rbx +; AVX512BW-NEXT: addq %rdx, %rbx +; AVX512BW-NEXT: addq %rsi, %r13 +; AVX512BW-NEXT: adcq %rdi, %rbx +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: mulq %r12 ; AVX512BW-NEXT: movq %rdx, %rbp -; AVX512BW-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: leaq {{[0-9]+}}(%rsp), %r8 -; AVX512BW-NEXT: movq %rbx, %rdi -; AVX512BW-NEXT: movq %r14, %rsi -; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX512BW-NEXT: movq %r12, %rcx -; AVX512BW-NEXT: callq __muloti4@PLT -; AVX512BW-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: setne %cl -; AVX512BW-NEXT: kmovd %ecx, %k0 -; AVX512BW-NEXT: cmpq $0, {{[0-9]+}}(%rsp) -; AVX512BW-NEXT: setne %cl -; AVX512BW-NEXT: andl $1, %ecx -; AVX512BW-NEXT: kmovw %ecx, %k1 +; AVX512BW-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: mulq %r12 +; AVX512BW-NEXT: movq %rdx, %rdi +; AVX512BW-NEXT: movq %rax, %rsi +; AVX512BW-NEXT: addq %rbp, %rsi +; AVX512BW-NEXT: adcq $0, %rdi +; AVX512BW-NEXT: movq %rcx, %rax +; AVX512BW-NEXT: mulq %r8 +; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: addq %rsi, %rcx +; AVX512BW-NEXT: adcq %rdi, %rbp +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: movzbl %al, %esi +; AVX512BW-NEXT: movq %r14, %rax +; AVX512BW-NEXT: mulq %r8 +; AVX512BW-NEXT: addq %rbp, %rax +; AVX512BW-NEXT: adcq %rsi, %rdx +; AVX512BW-NEXT: addq %r13, %rax +; AVX512BW-NEXT: adcq %rbx, %rdx +; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX512BW-NEXT: movq %rcx, 24(%r8) +; AVX512BW-NEXT: sarq $63, %rcx +; AVX512BW-NEXT: xorq %rcx, %rdx +; AVX512BW-NEXT: xorq %rax, %rcx +; AVX512BW-NEXT: orq %rdx, %rcx +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: kmovd %eax, %k0 +; AVX512BW-NEXT: movq %r11, %rdi +; AVX512BW-NEXT: sarq $63, %rdi +; AVX512BW-NEXT: movq %r9, %rsi +; AVX512BW-NEXT: imulq %rdi, %rsi +; AVX512BW-NEXT: movq %r9, %rax +; AVX512BW-NEXT: mulq %rdi +; AVX512BW-NEXT: movq %rax, %rcx +; AVX512BW-NEXT: addq %rsi, %rdx +; AVX512BW-NEXT: imulq %r10, %rdi +; AVX512BW-NEXT: addq %rdx, %rdi +; AVX512BW-NEXT: movq %r10, %rsi +; AVX512BW-NEXT: sarq $63, %rsi +; AVX512BW-NEXT: movq %rsi, %rbp +; AVX512BW-NEXT: imulq %r11, %rbp +; AVX512BW-NEXT: movq %rsi, %rax +; AVX512BW-NEXT: mulq %r15 +; AVX512BW-NEXT: movq %rax, %r12 +; AVX512BW-NEXT: addq %rbp, %rdx +; AVX512BW-NEXT: imulq %r15, %rsi +; AVX512BW-NEXT: addq %rdx, %rsi +; AVX512BW-NEXT: addq %rcx, %r12 +; AVX512BW-NEXT: adcq %rdi, %rsi +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: mulq %r9 +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %r14 +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: mulq %r9 +; AVX512BW-NEXT: movq %rdx, %rbp +; AVX512BW-NEXT: movq %rax, %rbx +; AVX512BW-NEXT: addq %rcx, %rbx +; AVX512BW-NEXT: adcq $0, %rbp +; AVX512BW-NEXT: movq %r15, %rax +; AVX512BW-NEXT: mulq %r10 +; AVX512BW-NEXT: movq %rdx, %rcx +; AVX512BW-NEXT: movq %rax, %rdi +; AVX512BW-NEXT: addq %rbx, %rdi +; AVX512BW-NEXT: adcq %rbp, %rcx +; AVX512BW-NEXT: setb %al +; AVX512BW-NEXT: movzbl %al, %ebp +; AVX512BW-NEXT: movq %r11, %rax +; AVX512BW-NEXT: mulq %r10 +; AVX512BW-NEXT: addq %rcx, %rax +; AVX512BW-NEXT: adcq %rbp, %rdx +; AVX512BW-NEXT: addq %r12, %rax +; AVX512BW-NEXT: adcq %rsi, %rdx +; AVX512BW-NEXT: movq %rdi, 8(%r8) +; AVX512BW-NEXT: sarq $63, %rdi +; AVX512BW-NEXT: xorq %rdi, %rdx +; AVX512BW-NEXT: xorq %rax, %rdi +; AVX512BW-NEXT: orq %rdx, %rdi +; AVX512BW-NEXT: setne %al +; AVX512BW-NEXT: andl $1, %eax +; AVX512BW-NEXT: kmovw %eax, %k1 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512BW-NEXT: korw %k0, %k1, %k1 ; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; AVX512BW-NEXT: movq %rdx, 24(%r15) -; AVX512BW-NEXT: movq %rax, 16(%r15) -; AVX512BW-NEXT: movq %rbp, 8(%r15) -; AVX512BW-NEXT: movq %r13, (%r15) -; AVX512BW-NEXT: addq $24, %rsp +; AVX512BW-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512BW-NEXT: movq %rax, 16(%r8) +; AVX512BW-NEXT: movq %r14, (%r8) ; AVX512BW-NEXT: popq %rbx ; AVX512BW-NEXT: popq %r12 ; AVX512BW-NEXT: popq %r13