diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1969,11 +1969,17 @@ if (Subtarget.useAVX512Regs()) { setGroup(MVT::v32f16); addRegisterClass(MVT::v32f16, &X86::VR512RegClass); + setOperationAction(ISD::FROUND, MVT::v32f16, Custom); + setOperationAction(ISD::FROUNDEVEN, MVT::v32f16, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::v32f16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); + setOperationAction(ISD::FROUND, MVT::v16f16, Custom); + setOperationAction(ISD::FROUNDEVEN, MVT::v16f16, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::v16f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); @@ -2027,6 +2033,9 @@ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); + setOperationAction(ISD::FROUND, MVT::v8f16, Custom); + setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -146,6 +146,444 @@ ret double %a } +define <8 x half> @roundeven_v8f16(<8 x half> %x) { +; SSE2-LABEL: roundeven_v8f16: +; SSE2: ## %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: .cfi_def_cfa_offset 40 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: .cfi_def_cfa_offset 48 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 56 +; SSE2-NEXT: subq $24, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 80 +; SSE2-NEXT: .cfi_offset %rbx, -56 +; SSE2-NEXT: .cfi_offset %r12, -48 +; SSE2-NEXT: .cfi_offset %r13, -40 +; SSE2-NEXT: .cfi_offset %r14, -32 +; SSE2-NEXT: .cfi_offset %r15, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movl %r9d, %r15d +; SSE2-NEXT: movl %r8d, %r12d +; SSE2-NEXT: movl %ecx, %r14d +; SSE2-NEXT: movl %edx, %ebp +; SSE2-NEXT: movq %rdi, %rbx +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d +; SSE2-NEXT: movzwl %si, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %bp, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %r14w, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; 
SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: movzwl %r12w, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: movzwl %r15w, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: movl %r13d, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, 14(%rbx) +; SSE2-NEXT: movw %bp, 12(%rbx) +; SSE2-NEXT: movw %r13w, 10(%rbx) +; SSE2-NEXT: movw %r15w, 8(%rbx) +; SSE2-NEXT: movw %r12w, 6(%rbx) +; SSE2-NEXT: movw %r14w, 4(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 2(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, (%rbx) +; SSE2-NEXT: movq %rbx, %rax +; SSE2-NEXT: addq $24, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_v8f16: +; SSE41: ## %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: .cfi_def_cfa_offset 16 +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: .cfi_def_cfa_offset 24 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: .cfi_def_cfa_offset 32 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: .cfi_def_cfa_offset 40 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: .cfi_def_cfa_offset 48 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: .cfi_def_cfa_offset 56 +; SSE41-NEXT: subq $24, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 80 +; SSE41-NEXT: .cfi_offset %rbx, -56 +; SSE41-NEXT: .cfi_offset %r12, -48 +; SSE41-NEXT: .cfi_offset %r13, -40 +; SSE41-NEXT: .cfi_offset %r14, -32 +; SSE41-NEXT: .cfi_offset %r15, -24 +; SSE41-NEXT: .cfi_offset %rbp, -16 +; SSE41-NEXT: movl %r9d, %r15d +; SSE41-NEXT: movl %r8d, %r12d +; SSE41-NEXT: movl %ecx, %r14d +; SSE41-NEXT: movl %edx, %ebp +; SSE41-NEXT: movq %rdi, %rbx +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d +; SSE41-NEXT: movzwl %si, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %bp, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq 
___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %r14w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: movzwl %r12w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: movzwl %r15w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: movl %r13d, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, 14(%rbx) +; SSE41-NEXT: movw %bp, 12(%rbx) +; SSE41-NEXT: movw %r13w, 10(%rbx) +; SSE41-NEXT: movw %r15w, 8(%rbx) +; SSE41-NEXT: movw %r12w, 6(%rbx) +; SSE41-NEXT: movw %r14w, 4(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 2(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, (%rbx) +; SSE41-NEXT: movq %rbx, %rax +; SSE41-NEXT: addq $24, %rsp +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: roundeven_v8f16: +; AVX1: ## %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: .cfi_def_cfa_offset 24 
+; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: .cfi_def_cfa_offset 40 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 56 +; AVX1-NEXT: subq $40, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 96 +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movl %r9d, %r15d +; AVX1-NEXT: movl %r8d, %r12d +; AVX1-NEXT: movl %ecx, %r14d +; AVX1-NEXT: movl %edx, %ebp +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d +; AVX1-NEXT: movzwl %si, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %bp, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %r14w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: movzwl %r12w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: movzwl %r15w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: movl %r13d, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, 14(%rbx) +; AVX1-NEXT: movw %bp, 12(%rbx) +; AVX1-NEXT: movw %r13w, 10(%rbx) +; AVX1-NEXT: movw %r15w, 8(%rbx) +; AVX1-NEXT: movw %r12w, 6(%rbx) +; AVX1-NEXT: movw %r14w, 4(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 2(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, (%rbx) +; AVX1-NEXT: movq %rbx, %rax +; AVX1-NEXT: addq $40, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX512F-LABEL: roundeven_v8f16: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; AVX512F-NEXT: movzwl %si, %esi +; AVX512F-NEXT: vmovd %esi, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm9 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512F-NEXT: vmovdqa %xmm9, %xmm1 +; AVX512F-NEXT: vpternlogd $248, %xmm8, %xmm0, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: movzwl %dx, %edx +; AVX512F-NEXT: vmovd %edx, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa %xmm9, %xmm2 +; AVX512F-NEXT: vpternlogd $248, %xmm8, %xmm1, %xmm2 +; AVX512F-NEXT: vaddss %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vroundss $11, %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512F-NEXT: movzwl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm9, %xmm3 +; AVX512F-NEXT: vpternlogd $248, %xmm8, %xmm2, %xmm3 +; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vroundss $11, %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512F-NEXT: movzwl %r8w, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm3 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vmovdqa %xmm9, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm8, %xmm3, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm3, %xmm3 +; AVX512F-NEXT: vroundss $11, %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512F-NEXT: movzwl %r9w, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm9, %xmm7 +; AVX512F-NEXT: vpternlogd $248, %xmm8, %xmm6, %xmm7 +; AVX512F-NEXT: vaddss %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vroundss $11, %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vcvtps2ph $4, %xmm6, %xmm6 +; AVX512F-NEXT: vmovd %edi, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vmovdqa %xmm9, %xmm5 +; AVX512F-NEXT: vpternlogd $248, %xmm8, %xmm7, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm7, 
%xmm5 +; AVX512F-NEXT: vroundss $11, %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; AVX512F-NEXT: vmovd %r11d, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vmovdqa %xmm9, %xmm4 +; AVX512F-NEXT: vpternlogd $248, %xmm8, %xmm7, %xmm4 +; AVX512F-NEXT: vaddss %xmm4, %xmm7, %xmm4 +; AVX512F-NEXT: vroundss $11, %xmm4, %xmm4, %xmm4 +; AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512F-NEXT: vmovd %r10d, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512F-NEXT: vpternlogd $248, %xmm8, %xmm7, %xmm9 +; AVX512F-NEXT: vaddss %xmm7, %xmm9, %xmm7 +; AVX512F-NEXT: vroundss $11, %xmm7, %xmm7, %xmm7 +; AVX512F-NEXT: vcvtps2ph $4, %xmm7, %xmm7 +; AVX512F-NEXT: vpextrw $0, %xmm7, 14(%rax) +; AVX512F-NEXT: vpextrw $0, %xmm4, 12(%rax) +; AVX512F-NEXT: vpextrw $0, %xmm5, 10(%rax) +; AVX512F-NEXT: vpextrw $0, %xmm6, 8(%rax) +; AVX512F-NEXT: vpextrw $0, %xmm3, 6(%rax) +; AVX512F-NEXT: vpextrw $0, %xmm2, 4(%rax) +; AVX512F-NEXT: vpextrw $0, %xmm1, 2(%rax) +; AVX512F-NEXT: vpextrw $0, %xmm0, (%rax) +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: roundeven_v8f16: +; AVX512FP16: ## %bb.0: +; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] +; AVX512FP16-NEXT: vpternlogq $248, %xmm1, %xmm0, %xmm2 +; AVX512FP16-NEXT: vaddph %xmm2, %xmm0, %xmm0 +; AVX512FP16-NEXT: vrndscaleph $11, %xmm0, %xmm0 +; AVX512FP16-NEXT: retq + %a = call <8 x half> @llvm.round.v8f16(<8 x half> %x) + ret <8 x half> %a +} + define <4 x float> @round_v4f32(<4 x float> %x) { ; SSE2-LABEL: round_v4f32: ; SSE2: ## %bb.0: @@ -248,141 +686,939 @@ ret <2 x double> %a } -define <8 x float> @round_v8f32(<8 x float> %x) { -; SSE2-LABEL: round_v8f32: +define <16 x half> @roundeven_v16f16(<16 x half> %x) { +; SSE2-LABEL: roundeven_v16f16: ; SSE2: ## %bb.0: -; SSE2-NEXT: subq $72, %rsp -; SSE2-NEXT: .cfi_def_cfa_offset 80 -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: .cfi_def_cfa_offset 40 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: .cfi_def_cfa_offset 48 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 56 +; SSE2-NEXT: subq $56, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 112 +; SSE2-NEXT: .cfi_offset %rbx, -56 +; SSE2-NEXT: .cfi_offset %r12, -48 +; SSE2-NEXT: .cfi_offset %r13, -40 +; SSE2-NEXT: .cfi_offset %r14, -32 +; SSE2-NEXT: .cfi_offset %r15, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movl %r9d, %ebp +; SSE2-NEXT: movl %r8d, %r15d +; SSE2-NEXT: movl %ecx, %r14d +; SSE2-NEXT: movl %edx, %r13d +; SSE2-NEXT: movq %rdi, %rbx +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; 
SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d +; SSE2-NEXT: movzwl %si, %edi +; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %r13w, %edi +; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %r14w, %edi +; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %r15w, %edi +; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] -; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %bp, %edi +; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl %r12d, %edi +; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: 
callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 ; SSE2-NEXT: callq _roundf -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload -; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: addq $72, %rsp +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, 30(%rbx) +; SSE2-NEXT: movw %r12w, 28(%rbx) +; SSE2-NEXT: movw %r15w, 26(%rbx) +; SSE2-NEXT: movw %r14w, 24(%rbx) +; SSE2-NEXT: movw %r13w, 22(%rbx) +; SSE2-NEXT: movw %bp, 20(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 18(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 16(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 14(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 12(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 10(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 8(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 6(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 4(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 2(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, (%rbx) +; 
SSE2-NEXT: movq %rbx, %rax +; SSE2-NEXT: addq $56, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; SSE41-LABEL: round_v8f32: +; SSE41-LABEL: roundeven_v16f16: ; SSE41: ## %bb.0: -; SSE41-NEXT: movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; SSE41-NEXT: movaps %xmm0, %xmm3 -; SSE41-NEXT: andps %xmm2, %xmm3 -; SSE41-NEXT: movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] -; SSE41-NEXT: orps %xmm4, %xmm3 -; SSE41-NEXT: addps %xmm0, %xmm3 -; SSE41-NEXT: roundps $11, %xmm3, %xmm0 -; SSE41-NEXT: andps %xmm1, %xmm2 -; SSE41-NEXT: orps %xmm4, %xmm2 -; SSE41-NEXT: addps %xmm1, %xmm2 -; SSE41-NEXT: roundps $11, %xmm2, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: round_v8f32: -; AVX1: ## %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vroundps $11, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: round_v8f32: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] -; AVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 -; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vroundps $11, %ymm0, %ymm0 -; AVX512-NEXT: retq - %a = call <8 x float> @llvm.round.v8f32(<8 x float> %x) - ret <8 x float> %a -} - -define <4 x double> @round_v4f64(<4 x double> %x) { -; SSE2-LABEL: round_v4f64: -; SSE2: ## %bb.0: -; SSE2-NEXT: subq $56, %rsp -; SSE2-NEXT: .cfi_def_cfa_offset 64 +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: .cfi_def_cfa_offset 16 +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: .cfi_def_cfa_offset 24 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: .cfi_def_cfa_offset 32 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: .cfi_def_cfa_offset 40 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: .cfi_def_cfa_offset 48 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: .cfi_def_cfa_offset 56 +; SSE41-NEXT: subq $56, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 112 +; SSE41-NEXT: .cfi_offset %rbx, -56 +; SSE41-NEXT: .cfi_offset %r12, -48 +; SSE41-NEXT: .cfi_offset %r13, -40 +; SSE41-NEXT: .cfi_offset %r14, -32 +; SSE41-NEXT: .cfi_offset %r15, -24 +; SSE41-NEXT: .cfi_offset %rbp, -16 +; SSE41-NEXT: movl %r9d, %r12d +; SSE41-NEXT: movl %r8d, %r15d +; SSE41-NEXT: movl %ecx, %r14d +; SSE41-NEXT: movl %edx, %r13d +; SSE41-NEXT: movq %rdi, %rbx +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp +; SSE41-NEXT: movzwl %si, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %r13w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %r14w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %r15w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %r12w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl %ebp, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte 
Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, 30(%rbx) +; SSE41-NEXT: movw %r12w, 28(%rbx) +; SSE41-NEXT: movw %r15w, 26(%rbx) +; SSE41-NEXT: movw %r14w, 24(%rbx) +; SSE41-NEXT: movw %r13w, 22(%rbx) +; SSE41-NEXT: movw %bp, 20(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 18(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 16(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 14(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 12(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 10(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 8(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 6(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 4(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 2(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, (%rbx) +; SSE41-NEXT: movq %rbx, %rax +; SSE41-NEXT: addq $56, %rsp +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: roundeven_v16f16: +; AVX1: ## %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: .cfi_def_cfa_offset 24 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: .cfi_def_cfa_offset 40 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 56 +; AVX1-NEXT: subq $72, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 128 +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movl %r9d, %ebp +; AVX1-NEXT: movl %r8d, %r15d +; AVX1-NEXT: movl %ecx, %r14d +; AVX1-NEXT: movl %edx, %r13d +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl 
{{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d +; AVX1-NEXT: movzwl %si, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX1-NEXT: vmovaps %xmm2, (%rsp) ## 16-byte Spill +; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %r13w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %r14w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %r15w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %bp, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl %r12d, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; 
AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq 
___truncsfhf2 +; AVX1-NEXT: movw %ax, 30(%rbx) +; AVX1-NEXT: movw %r12w, 28(%rbx) +; AVX1-NEXT: movw %r15w, 26(%rbx) +; AVX1-NEXT: movw %r14w, 24(%rbx) +; AVX1-NEXT: movw %r13w, 22(%rbx) +; AVX1-NEXT: movw %bp, 20(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 18(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 16(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 14(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 12(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 10(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 8(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 6(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 4(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 2(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, (%rbx) +; AVX1-NEXT: movq %rbx, %rax +; AVX1-NEXT: addq $72, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX512F-LABEL: roundeven_v16f16: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movzwl %si, %esi +; AVX512F-NEXT: vmovd %esi, %xmm8 +; AVX512F-NEXT: movzwl %dx, %edx +; AVX512F-NEXT: vmovd %edx, %xmm9 +; AVX512F-NEXT: movzwl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm10 +; AVX512F-NEXT: movzwl %r8w, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm11 +; AVX512F-NEXT: movzwl %r9w, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm12 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm13 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm14 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm15 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm16 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm17 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm18 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm19 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm20 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm22 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm23 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm21 +; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm3 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm24 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm25 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm3, %xmm2 +; AVX512F-NEXT: vaddss %xmm2, %xmm3, %xmm8 +; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm2 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm2, %xmm3 +; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm9 +; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm3 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-NEXT: vpternlogd $248, %xmm24, 
%xmm3, %xmm4 +; AVX512F-NEXT: vaddss %xmm4, %xmm3, %xmm10 +; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm4 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm7 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm4, %xmm7 +; AVX512F-NEXT: vaddss %xmm7, %xmm4, %xmm11 +; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm7, %xmm12 +; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm7, %xmm13 +; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm0 +; AVX512F-NEXT: vaddss %xmm0, %xmm7, %xmm14 +; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm7, %xmm15 +; AVX512F-NEXT: vcvtph2ps %xmm16, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm2 +; AVX512F-NEXT: vaddss %xmm2, %xmm7, %xmm16 +; AVX512F-NEXT: vcvtph2ps %xmm17, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm3 +; AVX512F-NEXT: vaddss %xmm3, %xmm7, %xmm17 +; AVX512F-NEXT: vcvtph2ps %xmm18, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm4 +; AVX512F-NEXT: vaddss %xmm4, %xmm7, %xmm18 +; AVX512F-NEXT: vcvtph2ps %xmm19, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm7, %xmm19 +; AVX512F-NEXT: vcvtph2ps %xmm20, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm7, %xmm20 +; AVX512F-NEXT: vcvtph2ps %xmm22, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm0 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm0 +; AVX512F-NEXT: vaddss %xmm0, %xmm7, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm23, %xmm7 +; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm7, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm7, %xmm1 +; AVX512F-NEXT: vroundss $11, %xmm8, %xmm8, %xmm7 +; AVX512F-NEXT: vcvtps2ph $4, %xmm7, %xmm8 +; AVX512F-NEXT: vroundss $11, %xmm9, %xmm9, %xmm7 +; AVX512F-NEXT: vcvtps2ph $4, %xmm7, %xmm9 +; AVX512F-NEXT: vroundss $11, %xmm10, %xmm10, %xmm7 +; AVX512F-NEXT: vcvtps2ph $4, %xmm7, %xmm10 +; AVX512F-NEXT: vroundss $11, %xmm11, %xmm11, %xmm7 +; AVX512F-NEXT: vcvtps2ph $4, %xmm7, %xmm11 +; AVX512F-NEXT: vroundss $11, %xmm12, %xmm12, %xmm2 +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm12 +; AVX512F-NEXT: vroundss $11, %xmm13, %xmm13, %xmm7 +; AVX512F-NEXT: vcvtps2ph $4, %xmm7, %xmm13 +; AVX512F-NEXT: vroundss $11, %xmm14, %xmm14, %xmm3 +; AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm14 +; AVX512F-NEXT: vroundss $11, %xmm15, %xmm15, %xmm2 +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm15 +; AVX512F-NEXT: vrndscaless $11, %xmm16, %xmm16, %xmm4 +; AVX512F-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512F-NEXT: vrndscaless $11, %xmm17, %xmm17, %xmm5 +; AVX512F-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; AVX512F-NEXT: vrndscaless $11, %xmm18, %xmm18, %xmm7 +; AVX512F-NEXT: vcvtps2ph $4, %xmm7, %xmm7 +; AVX512F-NEXT: vrndscaless $11, %xmm19, %xmm19, %xmm6 +; AVX512F-NEXT: vcvtps2ph $4, %xmm6, %xmm6 +; AVX512F-NEXT: vrndscaless $11, %xmm20, %xmm20, %xmm3 +; AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512F-NEXT: vroundss $11, 
%xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vroundss $11, %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm21, %xmm2 +; AVX512F-NEXT: vpternlogd $248, %xmm24, %xmm2, %xmm25 +; AVX512F-NEXT: vaddss %xmm25, %xmm2, %xmm2 +; AVX512F-NEXT: vroundss $11, %xmm2, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512F-NEXT: vpextrw $0, %xmm2, 30(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm1, 28(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm0, 26(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm3, 24(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm6, 22(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm7, 20(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm5, 18(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm4, 16(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm15, 14(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm14, 12(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm13, 10(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm12, 8(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm11, 6(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm10, 4(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm9, 2(%rdi) +; AVX512F-NEXT: vpextrw $0, %xmm8, (%rdi) +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: roundeven_v16f16: +; AVX512FP16: ## %bb.0: +; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] +; AVX512FP16-NEXT: vpternlogq $248, %ymm1, %ymm0, %ymm2 +; AVX512FP16-NEXT: vaddph %ymm2, %ymm0, %ymm0 +; AVX512FP16-NEXT: vrndscaleph $11, %ymm0, %ymm0 +; AVX512FP16-NEXT: retq + %a = call <16 x half> @llvm.round.v16f16(<16 x half> %x) + ret <16 x half> %a +} + +define <8 x float> @round_v8f32(<8 x float> %x) { +; SSE2-LABEL: round_v8f32: +; SSE2: ## %bb.0: +; SSE2-NEXT: subq $72, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 80 ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill -; SSE2-NEXT: callq _round +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: callq _round +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] ; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: callq _round +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE2-NEXT: callq _roundf ; SSE2-NEXT: movaps %xmm0, (%rsp) ## 
16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload ; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: callq _round -; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload -; SSE2-NEXT: addq $56, %rsp +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload +; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0] +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: addq $72, %rsp ; SSE2-NEXT: retq ; -; SSE41-LABEL: round_v4f64: +; SSE41-LABEL: round_v8f32: ; SSE41: ## %bb.0: -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] -; SSE41-NEXT: movapd %xmm0, %xmm3 -; SSE41-NEXT: andpd %xmm2, %xmm3 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1] -; SSE41-NEXT: orpd %xmm4, %xmm3 -; SSE41-NEXT: addpd %xmm0, %xmm3 -; SSE41-NEXT: roundpd $11, %xmm3, %xmm0 -; SSE41-NEXT: andpd %xmm1, %xmm2 -; SSE41-NEXT: orpd %xmm4, %xmm2 -; SSE41-NEXT: addpd %xmm1, %xmm2 -; SSE41-NEXT: roundpd $11, %xmm2, %xmm1 +; SSE41-NEXT: movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: andps %xmm2, %xmm3 +; SSE41-NEXT: movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; SSE41-NEXT: orps %xmm4, %xmm3 +; SSE41-NEXT: addps %xmm0, %xmm3 +; SSE41-NEXT: roundps $11, %xmm3, %xmm0 +; SSE41-NEXT: andps %xmm1, %xmm2 +; SSE41-NEXT: orps %xmm4, %xmm2 +; SSE41-NEXT: addps %xmm1, %xmm2 +; SSE41-NEXT: roundps $11, %xmm2, %xmm1 ; SSE41-NEXT: retq ; -; AVX1-LABEL: round_v4f64: +; AVX1-LABEL: round_v8f32: ; AVX1: ## %bb.0: -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 -; AVX1-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0 -; AVX1-NEXT: retq -; +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: round_v8f32: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vroundps $11, %ymm0, %ymm0 +; AVX512-NEXT: retq + %a = call <8 x float> @llvm.round.v8f32(<8 x float> %x) + ret <8 x float> %a +} + +define <4 x double> @round_v4f64(<4 x double> %x) { +; SSE2-LABEL: round_v4f64: +; SSE2: ## %bb.0: +; SSE2-NEXT: subq $56, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 64 +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: callq _round +; 
SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _round +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: callq _round +; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: callq _round +; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload +; SSE2-NEXT: addq $56, %rsp +; SSE2-NEXT: retq +; +; SSE41-LABEL: round_v4f64: +; SSE41: ## %bb.0: +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; SSE41-NEXT: movapd %xmm0, %xmm3 +; SSE41-NEXT: andpd %xmm2, %xmm3 +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1] +; SSE41-NEXT: orpd %xmm4, %xmm3 +; SSE41-NEXT: addpd %xmm0, %xmm3 +; SSE41-NEXT: roundpd $11, %xmm3, %xmm0 +; SSE41-NEXT: andpd %xmm1, %xmm2 +; SSE41-NEXT: orpd %xmm4, %xmm2 +; SSE41-NEXT: addpd %xmm1, %xmm2 +; SSE41-NEXT: roundpd $11, %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX1-LABEL: round_v4f64: +; AVX1: ## %bb.0: +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX1-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0 +; AVX1-NEXT: retq +; ; AVX512-LABEL: round_v4f64: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] @@ -394,6 +1630,1544 @@ ret <4 x double> %a } +define <32 x half> @roundeven_v32f16(<32 x half> %x) { +; SSE2-LABEL: roundeven_v32f16: +; SSE2: ## %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %r15 +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: pushq %r13 +; SSE2-NEXT: .cfi_def_cfa_offset 40 +; SSE2-NEXT: pushq %r12 +; SSE2-NEXT: .cfi_def_cfa_offset 48 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 56 +; SSE2-NEXT: subq $120, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 176 +; SSE2-NEXT: .cfi_offset %rbx, -56 +; SSE2-NEXT: .cfi_offset %r12, -48 +; SSE2-NEXT: .cfi_offset %r13, -40 +; SSE2-NEXT: .cfi_offset %r14, -32 +; SSE2-NEXT: .cfi_offset %r15, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movl %r8d, %ebp +; SSE2-NEXT: movl %ecx, %r13d +; SSE2-NEXT: movl %edx, %r14d +; SSE2-NEXT: movq %rdi, %rbx +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) 
## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; SSE2-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d +; SSE2-NEXT: movzwl %si, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %r14w, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %r13w, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl %bp, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 2-byte Folded Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl %r12d, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl %r15d, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; 
SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r14d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r13d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %ebp +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r12d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movl %eax, %r15d +; SSE2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: movw %ax, 62(%rbx) +; SSE2-NEXT: movw %r15w, 60(%rbx) +; SSE2-NEXT: movw %r12w, 58(%rbx) +; SSE2-NEXT: movw %bp, 56(%rbx) +; SSE2-NEXT: movw %r13w, 54(%rbx) +; SSE2-NEXT: movw %r14w, 52(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 50(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 48(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 46(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 44(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 42(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 40(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 38(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 36(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 34(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 32(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 30(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 28(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 
2-byte Folded Reload +; SSE2-NEXT: movw %ax, 26(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 24(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 22(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 20(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 18(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 16(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 14(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 12(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 10(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 8(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 6(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 4(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, 2(%rbx) +; SSE2-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE2-NEXT: movw %ax, (%rbx) +; SSE2-NEXT: movq %rbx, %rax +; SSE2-NEXT: addq $120, %rsp +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: popq %r12 +; SSE2-NEXT: popq %r13 +; SSE2-NEXT: popq %r14 +; SSE2-NEXT: popq %r15 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_v32f16: +; SSE41: ## %bb.0: +; SSE41-NEXT: pushq %rbp +; SSE41-NEXT: .cfi_def_cfa_offset 16 +; SSE41-NEXT: pushq %r15 +; SSE41-NEXT: .cfi_def_cfa_offset 24 +; SSE41-NEXT: pushq %r14 +; SSE41-NEXT: .cfi_def_cfa_offset 32 +; SSE41-NEXT: pushq %r13 +; SSE41-NEXT: .cfi_def_cfa_offset 40 +; SSE41-NEXT: pushq %r12 +; SSE41-NEXT: .cfi_def_cfa_offset 48 +; SSE41-NEXT: pushq %rbx +; SSE41-NEXT: .cfi_def_cfa_offset 56 +; SSE41-NEXT: subq $120, %rsp +; SSE41-NEXT: .cfi_def_cfa_offset 176 +; SSE41-NEXT: .cfi_offset %rbx, -56 +; SSE41-NEXT: .cfi_offset %r12, -48 +; SSE41-NEXT: .cfi_offset %r13, -40 +; SSE41-NEXT: .cfi_offset %r14, -32 +; SSE41-NEXT: .cfi_offset %r15, -24 +; SSE41-NEXT: .cfi_offset %rbp, -16 +; SSE41-NEXT: movl %r9d, %r15d +; SSE41-NEXT: movl %r8d, %ebp +; SSE41-NEXT: movl %ecx, %r13d +; SSE41-NEXT: movl %edx, %r14d +; SSE41-NEXT: movq %rdi, %rbx +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; SSE41-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; SSE41-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d +; SSE41-NEXT: movzwl %si, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %r14w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %r13w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %bp, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; 
SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movzwl %r15w, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl %r12d, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 
+; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: 
roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq 
___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r14d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r13d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %ebp +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r12d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps %xmm0, %xmm1 +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movl %eax, %r15d +; SSE41-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: orps %xmm2, %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: movw %ax, 62(%rbx) +; SSE41-NEXT: movw %r15w, 60(%rbx) +; SSE41-NEXT: movw %r12w, 58(%rbx) +; SSE41-NEXT: movw %bp, 56(%rbx) +; SSE41-NEXT: movw %r13w, 54(%rbx) +; SSE41-NEXT: movw 
%r14w, 52(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 50(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 48(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 46(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 44(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 42(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 40(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 38(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 36(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 34(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 32(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 30(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 28(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 26(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 24(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 22(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 20(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 18(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 16(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 14(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 12(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 10(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 8(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 6(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 4(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, 2(%rbx) +; SSE41-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; SSE41-NEXT: movw %ax, (%rbx) +; SSE41-NEXT: movq %rbx, %rax +; SSE41-NEXT: addq $120, %rsp +; SSE41-NEXT: popq %rbx +; SSE41-NEXT: popq %r12 +; SSE41-NEXT: popq %r13 +; SSE41-NEXT: popq %r14 +; SSE41-NEXT: popq %r15 +; SSE41-NEXT: popq %rbp +; SSE41-NEXT: retq +; +; AVX1-LABEL: roundeven_v32f16: +; AVX1: ## %bb.0: +; AVX1-NEXT: pushq %rbp +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: pushq %r15 +; AVX1-NEXT: .cfi_def_cfa_offset 24 +; AVX1-NEXT: pushq %r14 +; AVX1-NEXT: .cfi_def_cfa_offset 32 +; AVX1-NEXT: pushq %r13 +; AVX1-NEXT: .cfi_def_cfa_offset 40 +; AVX1-NEXT: pushq %r12 +; AVX1-NEXT: .cfi_def_cfa_offset 48 +; AVX1-NEXT: pushq %rbx +; AVX1-NEXT: .cfi_def_cfa_offset 56 
+; AVX1-NEXT: subq $136, %rsp +; AVX1-NEXT: .cfi_def_cfa_offset 192 +; AVX1-NEXT: .cfi_offset %rbx, -56 +; AVX1-NEXT: .cfi_offset %r12, -48 +; AVX1-NEXT: .cfi_offset %r13, -40 +; AVX1-NEXT: .cfi_offset %r14, -32 +; AVX1-NEXT: .cfi_offset %r15, -24 +; AVX1-NEXT: .cfi_offset %rbp, -16 +; AVX1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movl %r8d, %r12d +; AVX1-NEXT: movl %ecx, %r13d +; AVX1-NEXT: movl %edx, %r14d +; AVX1-NEXT: movq %rdi, %rbx +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; AVX1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; AVX1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp +; AVX1-NEXT: movzwl %si, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX1-NEXT: 
vmovaps %xmm2, (%rsp) ## 16-byte Spill +; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %r14w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %r13w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl %r12w, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 2-byte Folded Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl %ebp, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl %r15d, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; 
AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: 
vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; 
AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r14d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r13d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %ebp +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r12d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movl %eax, %r15d +; AVX1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi ## 4-byte Reload +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorps (%rsp), %xmm1, %xmm1 ## 16-byte Folded Reload +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: movw %ax, 62(%rbx) +; AVX1-NEXT: movw %r15w, 60(%rbx) +; AVX1-NEXT: movw %r12w, 58(%rbx) +; AVX1-NEXT: movw %bp, 56(%rbx) +; AVX1-NEXT: movw %r13w, 54(%rbx) +; AVX1-NEXT: movw %r14w, 52(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 50(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 48(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 46(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 44(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 42(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 40(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 38(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 36(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 34(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded 
Reload +; AVX1-NEXT: movw %ax, 32(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 30(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 28(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 26(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 24(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 22(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 20(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 18(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 16(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 14(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 12(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 10(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 8(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 6(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 4(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, 2(%rbx) +; AVX1-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax ## 2-byte Folded Reload +; AVX1-NEXT: movw %ax, (%rbx) +; AVX1-NEXT: movq %rbx, %rax +; AVX1-NEXT: addq $136, %rsp +; AVX1-NEXT: popq %rbx +; AVX1-NEXT: popq %r12 +; AVX1-NEXT: popq %r13 +; AVX1-NEXT: popq %r14 +; AVX1-NEXT: popq %r15 +; AVX1-NEXT: popq %rbp +; AVX1-NEXT: retq +; +; AVX512F-LABEL: roundeven_v32f16: +; AVX512F: ## %bb.0: +; AVX512F-NEXT: movq %rdi, %rax +; AVX512F-NEXT: movzwl %si, %esi +; AVX512F-NEXT: vmovd %esi, %xmm8 +; AVX512F-NEXT: movzwl %dx, %edx +; AVX512F-NEXT: vmovd %edx, %xmm2 +; AVX512F-NEXT: movzwl %cx, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm10 +; AVX512F-NEXT: movzwl %r8w, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm11 +; AVX512F-NEXT: movzwl %r9w, %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm12 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm13 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm14 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm15 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm16 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm17 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm4 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm19 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm21 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm3 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm5 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm9 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm0 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm29 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm30 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm31 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm24 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm25 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm18 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm20 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm27 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm6 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm7 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512F-NEXT: vmovdqa %xmm8, %xmm1 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm6, %xmm1 +; AVX512F-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm2 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm2 +; AVX512F-NEXT: vaddss %xmm2, %xmm6, %xmm1 +; AVX512F-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX512F-NEXT: vcvtph2ps %xmm10, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm1 +; AVX512F-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm1 +; AVX512F-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm1 +; AVX512F-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm1 +; AVX512F-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill +; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm28 +; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm26 +; AVX512F-NEXT: vcvtph2ps %xmm16, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm23 +; AVX512F-NEXT: vcvtph2ps %xmm17, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd 
$248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm22 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm2, %xmm17 +; AVX512F-NEXT: vcvtph2ps %xmm19, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm4 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm4 +; AVX512F-NEXT: vaddss %xmm4, %xmm6, %xmm19 +; AVX512F-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 ## 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm4, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm4, %xmm16 +; AVX512F-NEXT: vcvtph2ps %xmm21, %xmm4 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm1 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm4, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm4, %xmm21 +; AVX512F-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa %xmm8, %xmm4 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm1, %xmm4 +; AVX512F-NEXT: vaddss %xmm4, %xmm1, %xmm15 +; AVX512F-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa %xmm8, %xmm2 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm1, %xmm2 +; AVX512F-NEXT: vaddss %xmm2, %xmm1, %xmm14 +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm3 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm2, %xmm3 +; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm13 +; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm3 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm5 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm3, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm3, %xmm12 +; AVX512F-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 ## 16-byte Folded Reload +; AVX512F-NEXT: vmovdqa %xmm8, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm5, %xmm6 +; AVX512F-NEXT: vaddss %xmm6, %xmm5, %xmm11 +; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm4 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm4 +; AVX512F-NEXT: vaddss %xmm4, %xmm6, %xmm10 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm1 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm6, %xmm9 +; AVX512F-NEXT: vcvtph2ps %xmm29, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm2 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm2 +; AVX512F-NEXT: vaddss %xmm2, %xmm6, %xmm29 +; AVX512F-NEXT: vcvtph2ps %xmm30, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm0 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm0 +; AVX512F-NEXT: vaddss %xmm0, %xmm6, %xmm30 +; AVX512F-NEXT: vcvtph2ps %xmm31, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm3 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm3 +; AVX512F-NEXT: vaddss %xmm3, %xmm6, %xmm31 +; AVX512F-NEXT: vcvtph2ps %xmm24, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm5 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm6, %xmm24 +; AVX512F-NEXT: vcvtph2ps %xmm25, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm4 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm4 +; AVX512F-NEXT: vaddss %xmm4, %xmm6, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm18, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm1 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm6, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm20, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm2 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm2 +; AVX512F-NEXT: vaddss %xmm2, %xmm6, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm27, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm0 +; AVX512F-NEXT: vpternlogd $248, 
%xmm7, %xmm6, %xmm0 +; AVX512F-NEXT: vaddss %xmm0, %xmm6, %xmm0 +; AVX512F-NEXT: vmovd %ecx, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm3 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm3 +; AVX512F-NEXT: vaddss %xmm3, %xmm6, %xmm3 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vmovdqa %xmm8, %xmm5 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm5 +; AVX512F-NEXT: vaddss %xmm5, %xmm6, %xmm5 +; AVX512F-NEXT: movzwl {{[0-9]+}}(%rsp), %ecx +; AVX512F-NEXT: vmovd %ecx, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm6 +; AVX512F-NEXT: vpternlogd $248, %xmm7, %xmm6, %xmm8 +; AVX512F-NEXT: vaddss %xmm6, %xmm8, %xmm6 +; AVX512F-NEXT: vroundss $11, %xmm6, %xmm6, %xmm6 +; AVX512F-NEXT: vcvtps2ph $4, %xmm6, %xmm6 +; AVX512F-NEXT: vpextrw $0, %xmm6, 62(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm5, %xmm5, %xmm5 +; AVX512F-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; AVX512F-NEXT: vpextrw $0, %xmm5, 60(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512F-NEXT: vpextrw $0, %xmm3, 58(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 56(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm2, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 54(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm1, %xmm1, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 52(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm4, %xmm4, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 50(%rdi) +; AVX512F-NEXT: vrndscaless $11, %xmm24, %xmm24, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 48(%rdi) +; AVX512F-NEXT: vrndscaless $11, %xmm31, %xmm31, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 46(%rdi) +; AVX512F-NEXT: vrndscaless $11, %xmm30, %xmm30, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 44(%rdi) +; AVX512F-NEXT: vrndscaless $11, %xmm29, %xmm29, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 42(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm9, %xmm9, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 40(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm10, %xmm10, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 38(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm11, %xmm11, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 36(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm12, %xmm12, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 34(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm13, %xmm13, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 32(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm14, %xmm14, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 30(%rdi) +; AVX512F-NEXT: vroundss $11, %xmm15, %xmm15, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 28(%rdi) +; AVX512F-NEXT: vrndscaless $11, %xmm21, %xmm21, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpextrw $0, %xmm0, 26(%rdi) +; AVX512F-NEXT: vrndscaless $11, %xmm16, %xmm16, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; 
AVX512F-NEXT: vpextrw $0, %xmm0, 24(%rdi)
+; AVX512F-NEXT: vrndscaless $11, %xmm19, %xmm19, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 22(%rdi)
+; AVX512F-NEXT: vrndscaless $11, %xmm17, %xmm17, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 20(%rdi)
+; AVX512F-NEXT: vrndscaless $11, %xmm22, %xmm22, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 18(%rdi)
+; AVX512F-NEXT: vrndscaless $11, %xmm23, %xmm23, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 16(%rdi)
+; AVX512F-NEXT: vrndscaless $11, %xmm26, %xmm26, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 14(%rdi)
+; AVX512F-NEXT: vrndscaless $11, %xmm28, %xmm28, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 12(%rdi)
+; AVX512F-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; AVX512F-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 10(%rdi)
+; AVX512F-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; AVX512F-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 8(%rdi)
+; AVX512F-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; AVX512F-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 6(%rdi)
+; AVX512F-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; AVX512F-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 4(%rdi)
+; AVX512F-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; AVX512F-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, 2(%rdi)
+; AVX512F-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; AVX512F-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: roundeven_v32f16:
+; AVX512FP16: ## %bb.0:
+; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1]
+; AVX512FP16-NEXT: vpternlogq $248, %zmm1, %zmm0, %zmm2
+; AVX512FP16-NEXT: vaddph %zmm2, %zmm0, %zmm0
+; AVX512FP16-NEXT: vrndscaleph $11, %zmm0, %zmm0
+; AVX512FP16-NEXT: retq
+  %a = call <32 x half> @llvm.round.v32f16(<32 x half> %x)
+  ret <32 x half> %a
+}
+
 define <16 x float> @round_v16f32(<16 x float> %x) {
 ; SSE2-LABEL: round_v16f32:
 ; SSE2: ## %bb.0:
@@ -643,9 +3417,12 @@
 declare half @llvm.round.f16(half)
 declare float @llvm.round.f32(float)
 declare double @llvm.round.f64(double)
+declare <8 x half> @llvm.round.v8f16(<8 x half>)
 declare <4 x float> @llvm.round.v4f32(<4 x float>)
 declare <2 x double> @llvm.round.v2f64(<2 x double>)
+declare <16 x half> @llvm.round.v16f16(<16 x half>)
 declare <8 x float> @llvm.round.v8f32(<8 x float>)
 declare <4 x double> @llvm.round.v4f64(<4 x double>)
+declare <32 x half> @llvm.round.v32f16(<32 x half>)
 declare <16 x float> @llvm.round.v16f32(<16 x float>)
 declare <8 x double> @llvm.round.v8f64(<8 x double>)