diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -631,6 +631,34 @@ setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, LibCall); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, LibCall); + setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-cmp-fp16.ll @@ -1,8 +1,58 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-32 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK-64 define i32 @test_f16_oeq_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_oeq_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovnel %ebx, %ebp +; SSE2-NEXT: cmovpl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_oeq_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovnel %esi, %eax +; AVX-NEXT: cmovpl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_oeq_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -29,6 +79,51 @@ } define i32 @test_f16_ogt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ogt_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovbel %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ogt_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovbel %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ogt_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -53,6 +148,51 @@ } define i32 @test_f16_oge_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_oge_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovbl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_oge_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovbl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_oge_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -77,6 +217,53 @@ } define i32 @test_f16_olt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_olt_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: ucomiss %xmm0, %xmm1 +; SSE2-NEXT: cmovbel %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_olt_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovbel %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_olt_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -101,6 +288,53 @@ } define i32 @test_f16_ole_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ole_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: ucomiss %xmm0, %xmm1 +; SSE2-NEXT: cmovbl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ole_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovbl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ole_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -125,6 +359,51 @@ } define i32 @test_f16_one_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_one_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovel %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_one_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovel %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_one_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -149,6 +428,51 @@ } define i32 @test_f16_ord_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ord_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovpl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ord_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovpl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ord_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -173,6 +497,51 @@ } define i32 @test_f16_ueq_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ueq_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovnel %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ueq_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovnel %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ueq_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -197,6 +566,53 @@ } define i32 @test_f16_ugt_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ugt_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: ucomiss %xmm0, %xmm1 +; SSE2-NEXT: cmovael %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ugt_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovael %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ugt_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -221,6 +637,53 @@ } define i32 @test_f16_uge_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_uge_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: ucomiss %xmm0, %xmm1 +; SSE2-NEXT: cmoval %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_uge_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmoval %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_uge_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -245,6 +708,51 @@ } define i32 @test_f16_ult_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ult_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovael %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ult_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovael %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ult_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -269,6 +777,51 @@ } define i32 @test_f16_ule_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ule_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmoval %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ule_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmoval %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ule_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -293,6 +846,53 @@ } define i32 @test_f16_une_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_une_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovnel %ebp, %ebx +; SSE2-NEXT: cmovpl %ebp, %ebx +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_une_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovnel %edi, %eax +; AVX-NEXT: cmovpl %edi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_une_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -319,6 +919,51 @@ } define i32 @test_f16_uno_q(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_uno_q: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovnpl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_uno_q: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: cmovnpl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_uno_q: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -343,6 +988,53 @@ } define i32 @test_f16_oeq_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_oeq_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovnel %ebx, %ebp +; SSE2-NEXT: cmovpl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_oeq_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovnel %esi, %eax +; AVX-NEXT: cmovpl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_oeq_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -369,6 +1061,51 @@ } define i32 @test_f16_ogt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ogt_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovbel %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ogt_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovbel %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ogt_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -393,6 +1130,51 @@ } define i32 @test_f16_oge_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_oge_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovbl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_oge_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovbl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_oge_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -417,6 +1199,53 @@ } define i32 @test_f16_olt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_olt_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: comiss %xmm0, %xmm1 +; SSE2-NEXT: cmovbel %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_olt_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovbel %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_olt_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -441,6 +1270,53 @@ } define i32 @test_f16_ole_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ole_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: comiss %xmm0, %xmm1 +; SSE2-NEXT: cmovbl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ole_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovbl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ole_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -465,6 +1341,51 @@ } define i32 @test_f16_one_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_one_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovel %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_one_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovel %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_one_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -489,6 +1410,51 @@ } define i32 @test_f16_ord_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ord_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovpl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ord_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovpl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ord_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -513,6 +1479,51 @@ } define i32 @test_f16_ueq_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ueq_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovnel %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ueq_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovnel %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ueq_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -537,6 +1548,53 @@ } define i32 @test_f16_ugt_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ugt_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: comiss %xmm0, %xmm1 +; SSE2-NEXT: cmovael %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ugt_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovael %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ugt_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -561,6 +1619,53 @@ } define i32 @test_f16_uge_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_uge_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: comiss %xmm0, %xmm1 +; SSE2-NEXT: cmoval %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_uge_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: vpextrw $0, %xmm0, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmoval %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_uge_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -585,6 +1690,51 @@ } define i32 @test_f16_ult_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ult_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovael %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ult_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovael %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ult_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -609,6 +1759,51 @@ } define i32 @test_f16_ule_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_ule_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmoval %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_ule_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmoval %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_ule_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -633,6 +1828,53 @@ } define i32 @test_f16_une_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_une_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovnel %ebp, %ebx +; SSE2-NEXT: cmovpl %ebp, %ebx +; SSE2-NEXT: movl %ebx, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_une_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovnel %edi, %eax +; AVX-NEXT: cmovpl %edi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_une_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -659,6 +1901,51 @@ } define i32 @test_f16_uno_s(i32 %a, i32 %b, half %f1, half %f2) #0 { +; SSE2-LABEL: test_f16_uno_s: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 32 +; SSE2-NEXT: .cfi_offset %rbx, -24 +; SSE2-NEXT: .cfi_offset %rbp, -16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movl %esi, %ebx +; SSE2-NEXT: movl %edi, %ebp +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: comiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: cmovnpl %ebx, %ebp +; SSE2-NEXT: movl %ebp, %eax +; SSE2-NEXT: addq $8, %rsp +; SSE2-NEXT: .cfi_def_cfa_offset 24 +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rbp +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: test_f16_uno_s: +; AVX: # %bb.0: +; AVX-NEXT: movl %edi, %eax +; AVX-NEXT: vpextrw $0, %xmm0, %ecx +; AVX-NEXT: vpextrw $0, %xmm1, %edx +; AVX-NEXT: movzwl %dx, %edx +; AVX-NEXT: vmovd %edx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vcomiss %xmm0, %xmm1 +; AVX-NEXT: cmovnpl %esi, %eax +; AVX-NEXT: retq +; ; CHECK-32-LABEL: test_f16_uno_s: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -683,6 +1970,46 @@ } define void @foo(half %0, half %1) #0 { +; SSE2-LABEL: foo: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: ucomiss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: jbe .LBB28_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: jmp bar@PLT # TAILCALL +; SSE2-NEXT: .LBB28_1: +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: foo: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vucomiss %xmm0, %xmm1 +; AVX-NEXT: jbe .LBB28_1 +; AVX-NEXT: # %bb.2: +; AVX-NEXT: jmp bar@PLT # TAILCALL +; AVX-NEXT: .LBB28_1: +; AVX-NEXT: retq +; ; CHECK-32-LABEL: foo: ; CHECK-32: # %bb.0: ; CHECK-32-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -1,4 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64 @@ -14,6 +17,39 @@ declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata) define half @fadd_f16(half %a, half %b) nounwind strictfp { +; SSE2-LABEL: fadd_f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: addss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; AVX-LABEL: fadd_f16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: fadd_f16: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -31,6 +67,39 @@ } define half @fsub_f16(half %a, half %b) nounwind strictfp { +; SSE2-LABEL: fsub_f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: subss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; AVX-LABEL: fsub_f16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: fsub_f16: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -48,6 +117,39 @@ } define half @fmul_f16(half %a, half %b) nounwind strictfp { +; SSE2-LABEL: fmul_f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: mulss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; AVX-LABEL: fmul_f16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: fmul_f16: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -65,6 +167,39 @@ } define half @fdiv_f16(half %a, half %b) nounwind strictfp { +; SSE2-LABEL: fdiv_f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: divss (%rsp), %xmm0 # 4-byte Folded Reload +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; AVX-LABEL: fdiv_f16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: vpextrw $0, %xmm1, %ecx +; AVX-NEXT: movzwl %cx, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: fdiv_f16: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -82,6 +217,24 @@ } define void @fpext_f16_to_f32(half* %val, float* %ret) nounwind strictfp { +; SSE2-LABEL: fpext_f16_to_f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq %rsi, %rbx +; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movd %xmm0, (%rbx) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: retq +; +; AVX-LABEL: fpext_f16_to_f32: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vmovss %xmm0, (%rsi) +; AVX-NEXT: retq +; ; X86-LABEL: fpext_f16_to_f32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -105,6 +258,26 @@ } define void @fpext_f16_to_f64(half* %val, double* %ret) nounwind strictfp { +; SSE2-LABEL: fpext_f16_to_f64: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq %rsi, %rbx +; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvtss2sd %xmm0, %xmm0 +; SSE2-NEXT: movsd %xmm0, (%rbx) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: retq +; +; AVX-LABEL: fpext_f16_to_f64: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovsd %xmm0, (%rsi) +; AVX-NEXT: retq +; ; X86-LABEL: fpext_f16_to_f64: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -128,6 +301,25 @@ } define void @fptrunc_float_to_f16(float* %val, half *%ret) nounwind strictfp { +; SSE2-LABEL: fptrunc_float_to_f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq %rsi, %rbx +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rbx) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: retq +; +; AVX-LABEL: fptrunc_float_to_f16: +; AVX: # %bb.0: +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: movw %ax, (%rsi) +; AVX-NEXT: retq +; ; X86-LABEL: fptrunc_float_to_f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -152,6 +344,28 @@ } define void @fptrunc_double_to_f16(double* %val, half *%ret) nounwind strictfp { +; SSE2-LABEL: fptrunc_double_to_f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq %rsi, %rbx +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: callq __truncdfhf2@PLT +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rbx) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: retq +; +; AVX-LABEL: fptrunc_double_to_f16: +; AVX: # %bb.0: +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: movw %ax, (%rsi) +; AVX-NEXT: retq +; ; X86-LABEL: fptrunc_double_to_f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -176,6 +390,32 @@ } define void @fsqrt_f16(half* %a) nounwind strictfp { +; SSE2-LABEL: fsqrt_f16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rbx +; SSE2-NEXT: movq %rdi, %rbx +; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: sqrtss %xmm0, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movw %ax, (%rbx) +; SSE2-NEXT: popq %rbx +; SSE2-NEXT: retq +; +; AVX-LABEL: fsqrt_f16: +; AVX: # %bb.0: +; AVX-NEXT: movzwl (%rdi), %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: movw %ax, (%rdi) +; AVX-NEXT: retq +; ; X86-LABEL: fsqrt_f16: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -199,6 +439,76 @@ } define half @fma_f16(half %a, half %b, half %c) nounwind strictfp { +; SSE2-LABEL: fma_f16: +; SSE2: # %bb.0: +; SSE2-NEXT: subq $24, %rsp +; SSE2-NEXT: movss %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; SSE2-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload +; SSE2-NEXT: # xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: callq fmaf@PLT +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: addq $24, %rsp +; SSE2-NEXT: retq +; +; F16C-LABEL: fma_f16: +; F16C: # %bb.0: +; F16C-NEXT: pushq %rax +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: vpextrw $0, %xmm1, %ecx +; F16C-NEXT: vpextrw $0, %xmm2, %edx +; F16C-NEXT: movzwl %dx, %edx +; F16C-NEXT: vmovd %edx, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm2 +; F16C-NEXT: movzwl %cx, %ecx +; F16C-NEXT: vmovd %ecx, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm1 +; F16C-NEXT: movzwl %ax, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: callq fmaf@PLT +; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; F16C-NEXT: popq %rax +; F16C-NEXT: retq +; +; AVX512-LABEL: fma_f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrw $0, %xmm1, %eax +; AVX512-NEXT: vpextrw $0, %xmm0, %ecx +; AVX512-NEXT: vpextrw $0, %xmm2, %edx +; AVX512-NEXT: movzwl %dx, %edx +; AVX512-NEXT: vmovd %edx, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: movzwl %cx, %ecx +; AVX512-NEXT: vmovd %ecx, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: vmovd %eax, %xmm2 +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm1 * xmm2) + xmm0 +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: retq +; ; X86-LABEL: fma_f16: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll @@ -1,4 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64 @@ -14,6 +17,27 @@ declare i64 @llvm.experimental.constrained.fptoui.i64.f16(half, metadata) define i1 @fptosi_f16toi1(half %x) #0 { +; SSE2-LABEL: fptosi_f16toi1: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fptosi_f16toi1: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; ; X86-LABEL: fptosi_f16toi1: ; X86: # %bb.0: ; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax @@ -31,6 +55,27 @@ } define i8 @fptosi_f16toi8(half %x) #0 { +; SSE2-LABEL: fptosi_f16toi8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fptosi_f16toi8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; ; X86-LABEL: fptosi_f16toi8: ; X86: # %bb.0: ; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax @@ -48,6 +93,27 @@ } define i16 @fptosi_f16toi16(half %x) #0 { +; SSE2-LABEL: fptosi_f16toi16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fptosi_f16toi16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; ; X86-LABEL: fptosi_f16toi16: ; X86: # %bb.0: ; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax @@ -65,6 +131,25 @@ } define i32 @fptosi_f16toi32(half %x) #0 { +; SSE2-LABEL: fptosi_f16toi32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %eax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fptosi_f16toi32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm0, %eax +; AVX-NEXT: retq +; ; X86-LABEL: fptosi_f16toi32: ; X86: # %bb.0: ; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax @@ -80,6 +165,25 @@ } define i64 @fptosi_f16toi64(half %x) #0 { +; SSE2-LABEL: fptosi_f16toi64: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %rax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fptosi_f16toi64: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm0, %rax +; AVX-NEXT: retq +; ; X86-LABEL: fptosi_f16toi64: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 @@ -98,6 +202,27 @@ } define i1 @fptoui_f16toi1(half %x) #0 { +; SSE2-LABEL: fptoui_f16toi1: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fptoui_f16toi1: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; ; X86-LABEL: fptoui_f16toi1: ; X86: # %bb.0: ; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax @@ -115,6 +240,27 @@ } define i8 @fptoui_f16toi8(half %x) #0 { +; SSE2-LABEL: fptoui_f16toi8: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %eax +; SSE2-NEXT: # kill: def $al killed $al killed $eax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fptoui_f16toi8: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm0, %eax +; AVX-NEXT: # kill: def $al killed $al killed $eax +; AVX-NEXT: retq +; ; X86-LABEL: fptoui_f16toi8: ; X86: # %bb.0: ; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax @@ -132,6 +278,27 @@ } define i16 @fptoui_f16toi16(half %x) #0 { +; SSE2-LABEL: fptoui_f16toi16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %eax +; SSE2-NEXT: # kill: def $ax killed $ax killed $eax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fptoui_f16toi16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vcvttss2si %xmm0, %eax +; AVX-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-NEXT: retq +; ; X86-LABEL: fptoui_f16toi16: ; X86: # %bb.0: ; X86-NEXT: vcvttsh2si {{[0-9]+}}(%esp), %eax @@ -149,6 +316,36 @@ } define i32 @fptoui_f16toi32(half %x) #0 { +; SSE2-LABEL: fptoui_f16toi32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: cvttss2si %xmm0, %rax +; SSE2-NEXT: # kill: def $eax killed $eax killed $rax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; F16C-LABEL: fptoui_f16toi32: +; F16C: # %bb.0: +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: movzwl %ax, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vcvttss2si %xmm0, %rax +; F16C-NEXT: # kill: def $eax killed $eax killed $rax +; F16C-NEXT: retq +; +; AVX512-LABEL: fptoui_f16toi32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrw $0, %xmm0, %eax +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2usi %xmm0, %eax +; AVX512-NEXT: retq +; ; X86-LABEL: fptoui_f16toi32: ; X86: # %bb.0: ; X86-NEXT: vcvttsh2usi {{[0-9]+}}(%esp), %eax @@ -164,6 +361,58 @@ } define i64 @fptoui_f16toi64(half %x) #0 { +; SSE2-LABEL: fptoui_f16toi64: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: comiss %xmm2, %xmm0 +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: jb .LBB9_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: movaps %xmm2, %xmm1 +; SSE2-NEXT: .LBB9_2: +; SSE2-NEXT: subss %xmm1, %xmm0 +; SSE2-NEXT: cvttss2si %xmm0, %rcx +; SSE2-NEXT: setae %al +; SSE2-NEXT: movzbl %al, %eax +; SSE2-NEXT: shlq $63, %rax +; SSE2-NEXT: xorq %rcx, %rax +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; F16C-LABEL: fptoui_f16toi64: +; F16C: # %bb.0: +; F16C-NEXT: vpextrw $0, %xmm0, %eax +; F16C-NEXT: movzwl %ax, %eax +; F16C-NEXT: vmovd %eax, %xmm0 +; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 +; F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; F16C-NEXT: vcomiss %xmm1, %xmm0 +; F16C-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; F16C-NEXT: jb .LBB9_2 +; F16C-NEXT: # %bb.1: +; F16C-NEXT: vmovaps %xmm1, %xmm2 +; F16C-NEXT: .LBB9_2: +; F16C-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; F16C-NEXT: vcvttss2si %xmm0, %rcx +; F16C-NEXT: setae %al +; F16C-NEXT: movzbl %al, %eax +; F16C-NEXT: shlq $63, %rax +; F16C-NEXT: xorq %rcx, %rax +; F16C-NEXT: retq +; +; AVX512-LABEL: fptoui_f16toi64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrw $0, %xmm0, %eax +; AVX512-NEXT: movzwl %ax, %eax +; AVX512-NEXT: vmovd %eax, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vcvttss2usi %xmm0, %rax +; AVX512-NEXT: retq +; ; X86-LABEL: fptoui_f16toi64: ; X86: # %bb.0: ; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll @@ -1,4 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX,F16C +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX,AVX512 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=X64 @@ -14,6 +17,32 @@ declare half @llvm.experimental.constrained.uitofp.f16.i64(i64, metadata, metadata) define half @sitofp_i1tof16(i1 %x) #0 { +; SSE2-LABEL: sitofp_i1tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: andb $1, %dil +; SSE2-NEXT: negb %dil +; SSE2-NEXT: movsbl %dil, %eax +; SSE2-NEXT: cvtsi2ss %eax, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: sitofp_i1tof16: +; AVX: # %bb.0: +; AVX-NEXT: andb $1, %dil +; AVX-NEXT: negb %dil +; AVX-NEXT: movsbl %dil, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: sitofp_i1tof16: ; X86: # %bb.0: ; X86-NEXT: movb {{[0-9]+}}(%esp), %al @@ -37,6 +66,28 @@ } define half @sitofp_i8tof16(i8 %x) #0 { +; SSE2-LABEL: sitofp_i8tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movsbl %dil, %eax +; SSE2-NEXT: cvtsi2ss %eax, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: sitofp_i8tof16: +; AVX: # %bb.0: +; AVX-NEXT: movsbl %dil, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: sitofp_i8tof16: ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax @@ -55,6 +106,28 @@ } define half @sitofp_i16tof16(i16 %x) #0 { +; SSE2-LABEL: sitofp_i16tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movswl %di, %eax +; SSE2-NEXT: cvtsi2ss %eax, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: sitofp_i16tof16: +; AVX: # %bb.0: +; AVX-NEXT: movswl %di, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: sitofp_i16tof16: ; X86: # %bb.0: ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax @@ -73,6 +146,26 @@ } define half @sitofp_i32tof16(i32 %x) #0 { +; SSE2-LABEL: sitofp_i32tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: cvtsi2ss %edi, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: sitofp_i32tof16: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: sitofp_i32tof16: ; X86: # %bb.0: ; X86-NEXT: vcvtsi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 @@ -89,6 +182,26 @@ } define half @sitofp_i64tof16(i64 %x) #0 { +; SSE2-LABEL: sitofp_i64tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: cvtsi2ss %rdi, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: sitofp_i64tof16: +; AVX: # %bb.0: +; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: sitofp_i64tof16: ; X86: # %bb.0: ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -106,6 +219,28 @@ } define half @uitofp_i1tof16(i1 %x) #0 { +; SSE2-LABEL: uitofp_i1tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: andl $1, %edi +; SSE2-NEXT: cvtsi2ss %edi, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: uitofp_i1tof16: +; AVX: # %bb.0: +; AVX-NEXT: andl $1, %edi +; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: uitofp_i1tof16: ; X86: # %bb.0: ; X86-NEXT: movb {{[0-9]+}}(%esp), %al @@ -126,6 +261,28 @@ } define half @uitofp_i8tof16(i8 %x) #0 { +; SSE2-LABEL: uitofp_i8tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movzbl %dil, %eax +; SSE2-NEXT: cvtsi2ss %eax, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: uitofp_i8tof16: +; AVX: # %bb.0: +; AVX-NEXT: movzbl %dil, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: uitofp_i8tof16: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -144,6 +301,28 @@ } define half @uitofp_i16tof16(i16 %x) #0 { +; SSE2-LABEL: uitofp_i16tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movzwl %di, %eax +; SSE2-NEXT: cvtsi2ss %eax, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: uitofp_i16tof16: +; AVX: # %bb.0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: uitofp_i16tof16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax @@ -162,6 +341,38 @@ } define half @uitofp_i32tof16(i32 %x) #0 { +; SSE2-LABEL: uitofp_i32tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: cvtsi2ss %rax, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; F16C-LABEL: uitofp_i32tof16: +; F16C: # %bb.0: +; F16C-NEXT: movl %edi, %eax +; F16C-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; F16C-NEXT: retq +; +; AVX512-LABEL: uitofp_i32tof16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: retq +; ; X86-LABEL: uitofp_i32tof16: ; X86: # %bb.0: ; X86-NEXT: vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 @@ -178,6 +389,58 @@ } define half @uitofp_i64tof16(i64 %x) #0 { +; SSE2-LABEL: uitofp_i64tof16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %rdi, %rax +; SSE2-NEXT: shrq %rax +; SSE2-NEXT: movl %edi, %ecx +; SSE2-NEXT: andl $1, %ecx +; SSE2-NEXT: orq %rax, %rcx +; SSE2-NEXT: testq %rdi, %rdi +; SSE2-NEXT: cmovnsq %rdi, %rcx +; SSE2-NEXT: cvtsi2ss %rcx, %xmm0 +; SSE2-NEXT: jns .LBB9_2 +; SSE2-NEXT: # %bb.1: +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB9_2: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; F16C-LABEL: uitofp_i64tof16: +; F16C: # %bb.0: +; F16C-NEXT: movq %rdi, %rax +; F16C-NEXT: shrq %rax +; F16C-NEXT: movl %edi, %ecx +; F16C-NEXT: andl $1, %ecx +; F16C-NEXT: orq %rax, %rcx +; F16C-NEXT: testq %rdi, %rdi +; F16C-NEXT: cmovnsq %rdi, %rcx +; F16C-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; F16C-NEXT: jns .LBB9_2 +; F16C-NEXT: # %bb.1: +; F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0 +; F16C-NEXT: .LBB9_2: +; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; F16C-NEXT: retq +; +; AVX512-LABEL: uitofp_i64tof16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX512-NEXT: retq +; ; X86-LABEL: uitofp_i64tof16: ; X86: # %bb.0: ; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll @@ -1,4 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+f16c -O3 | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -O3 | FileCheck %s --check-prefixes=AVX ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 -O3 | FileCheck %s --check-prefixes=X64 @@ -11,6 +14,31 @@ declare half @llvm.experimental.constrained.round.f16(half, metadata) define half @fceil32(half %f) #0 { +; SSE2-LABEL: fceil32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: callq ceilf@PLT +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fceil32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: fceil32: ; X86: # %bb.0: ; X86-NEXT: vrndscalesh $10, {{[0-9]+}}(%esp), %xmm0, %xmm0 @@ -26,6 +54,31 @@ } define half @ffloor32(half %f) #0 { +; SSE2-LABEL: ffloor32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: callq floorf@PLT +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: ffloor32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: ffloor32: ; X86: # %bb.0: ; X86-NEXT: vrndscalesh $9, {{[0-9]+}}(%esp), %xmm0, %xmm0 @@ -41,6 +94,31 @@ } define half @ftrunc32(half %f) #0 { +; SSE2-LABEL: ftrunc32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: callq truncf@PLT +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: ftrunc32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: ftrunc32: ; X86: # %bb.0: ; X86-NEXT: vrndscalesh $11, {{[0-9]+}}(%esp), %xmm0, %xmm0 @@ -56,6 +134,31 @@ } define half @frint32(half %f) #0 { +; SSE2-LABEL: frint32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: callq rintf@PLT +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: frint32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: frint32: ; X86: # %bb.0: ; X86-NEXT: vrndscalesh $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 @@ -72,6 +175,31 @@ } define half @fnearbyint32(half %f) #0 { +; SSE2-LABEL: fnearbyint32: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: callq nearbyintf@PLT +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fnearbyint32: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: fnearbyint32: ; X86: # %bb.0: ; X86-NEXT: vrndscalesh $12, {{[0-9]+}}(%esp), %xmm0, %xmm0 @@ -88,6 +216,31 @@ } define half @froundeven16(half %f) #0 { +; SSE2-LABEL: froundeven16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: callq roundevenf@PLT +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: froundeven16: +; AVX: # %bb.0: +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: retq +; ; X86-LABEL: froundeven16: ; X86: # %bb.0: ; X86-NEXT: vrndscalesh $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 @@ -104,6 +257,35 @@ } define half @fround16(half %f) #0 { +; SSE2-LABEL: fround16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: callq roundf@PLT +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 8 +; SSE2-NEXT: retq +; +; AVX-LABEL: fround16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vpextrw $0, %xmm0, %eax +; AVX-NEXT: movzwl %ax, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX-NEXT: callq roundf@PLT +; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +; ; X86-LABEL: fround16: ; X86: # %bb.0: ; X86-NEXT: subl $8, %esp