diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22910,33 +22910,14 @@ SDLoc DL(Op); SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); SDValue In = Op.getOperand(IsStrict ? 1 : 0); - SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1); MVT VT = Op.getSimpleValueType(); MVT SVT = In.getSimpleValueType(); if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80)) return SDValue(); - if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT)) { - if (Subtarget.hasFP16()) - return Op; - - if (SVT.getScalarType() != MVT::f32) { - MVT TmpVT = - VT.isVector() ? SVT.changeVectorElementType(MVT::f32) : MVT::f32; - if (IsStrict) - return DAG.getNode( - ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other}, - {Chain, - DAG.getNode(ISD::STRICT_FP_ROUND, DL, {TmpVT, MVT::Other}, - {Chain, In, Op2}), - Op2}); - - return DAG.getNode(ISD::FP_ROUND, DL, VT, - DAG.getNode(ISD::FP_ROUND, DL, TmpVT, In, Op2), Op2); - } - - if (!Subtarget.hasF16C()) + if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) { + if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32) return SDValue(); if (VT.isVector()) @@ -32977,19 +32958,8 @@ } if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) { assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C"); - if (SrcVT == MVT::v2f64) { - if (IsStrict) - Src = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, - {MVT::v4f32, MVT::Other}, {Chain, Src}); - else - Src = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Src); - } else if (SrcVT == MVT::v4f64) { - if (IsStrict) - Src = DAG.getNode(ISD::STRICT_FP_ROUND, dl, {MVT::v4f32, MVT::Other}, - {Chain, Src, Rnd}); - else - Src = DAG.getNode(ISD::FP_ROUND, dl, MVT::v4f32, Src, Rnd); - } + if (SrcVT.getVectorElementType() != MVT::f32) + return; if (IsStrict) V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll --- a/llvm/test/CodeGen/X86/cvt16.ll +++ b/llvm/test/CodeGen/X86/cvt16.ll @@ -21,17 +21,14 @@ ; vcvtps2ph instructions -define void @test1(float %src, ptr %dest) { +define void @test1(float %src, ptr %dest) nounwind { ; LIBCALL-LABEL: test1: ; LIBCALL: # %bb.0: ; LIBCALL-NEXT: pushq %rbx -; LIBCALL-NEXT: .cfi_def_cfa_offset 16 -; LIBCALL-NEXT: .cfi_offset %rbx, -16 ; LIBCALL-NEXT: movq %rdi, %rbx ; LIBCALL-NEXT: callq __truncsfhf2@PLT ; LIBCALL-NEXT: pextrw $0, %xmm0, (%rbx) ; LIBCALL-NEXT: popq %rbx -; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: retq ; ; F16C-LABEL: test1: @@ -44,20 +41,17 @@ ; SOFTFLOAT-LABEL: test1: ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rbx -; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 16 -; SOFTFLOAT-NEXT: .cfi_offset %rbx, -16 ; SOFTFLOAT-NEXT: movq %rsi, %rbx ; SOFTFLOAT-NEXT: callq __gnu_f2h_ieee@PLT ; SOFTFLOAT-NEXT: movw %ax, (%rbx) ; SOFTFLOAT-NEXT: popq %rbx -; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 8 ; SOFTFLOAT-NEXT: retq %1 = tail call i16 @llvm.convert.to.fp16.f32(float %src) store i16 %1, ptr %dest, align 2 ret void } -define float @test2(ptr nocapture %src) { +define float @test2(ptr nocapture %src) nounwind { ; LIBCALL-LABEL: test2: ; LIBCALL: # %bb.0: ; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 @@ -73,11 +67,9 @@ ; SOFTFLOAT-LABEL: test2: ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax -; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 16 ; SOFTFLOAT-NEXT: movzwl (%rdi), %edi ; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT ; SOFTFLOAT-NEXT: popq %rcx -; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 8 ; SOFTFLOAT-NEXT: retq %1 = load i16, ptr %src, align 2 %2 = tail call float @llvm.convert.from.fp16.f32(i16 %1) @@ -118,16 +110,14 @@ ret float %2 } -define double @test4(ptr nocapture %src) { +define double @test4(ptr nocapture %src) nounwind { ; LIBCALL-LABEL: test4: ; LIBCALL: # %bb.0: ; LIBCALL-NEXT: pushq %rax -; LIBCALL-NEXT: .cfi_def_cfa_offset 16 ; LIBCALL-NEXT: pinsrw $0, (%rdi), %xmm0 ; LIBCALL-NEXT: callq __extendhfsf2@PLT ; LIBCALL-NEXT: cvtss2sd %xmm0, %xmm0 ; LIBCALL-NEXT: popq %rax -; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: retq ; ; F16C-LABEL: test4: @@ -141,46 +131,41 @@ ; SOFTFLOAT-LABEL: test4: ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax -; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 16 ; SOFTFLOAT-NEXT: movzwl (%rdi), %edi ; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT ; SOFTFLOAT-NEXT: movl %eax, %edi ; SOFTFLOAT-NEXT: callq __extendsfdf2@PLT ; SOFTFLOAT-NEXT: popq %rcx -; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 8 ; SOFTFLOAT-NEXT: retq %1 = load i16, ptr %src, align 2 %2 = tail call double @llvm.convert.from.fp16.f64(i16 %1) ret double %2 } -define i16 @test5(double %src) { +define i16 @test5(double %src) nounwind { ; LIBCALL-LABEL: test5: ; LIBCALL: # %bb.0: ; LIBCALL-NEXT: pushq %rax -; LIBCALL-NEXT: .cfi_def_cfa_offset 16 ; LIBCALL-NEXT: callq __truncdfhf2@PLT ; LIBCALL-NEXT: pextrw $0, %xmm0, %eax ; LIBCALL-NEXT: # kill: def $ax killed $ax killed $eax ; LIBCALL-NEXT: popq %rcx -; LIBCALL-NEXT: .cfi_def_cfa_offset 8 ; LIBCALL-NEXT: retq ; ; F16C-LABEL: test5: ; F16C: # %bb.0: -; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax +; F16C-NEXT: pushq %rax +; F16C-NEXT: callq __truncdfhf2@PLT +; F16C-NEXT: vpextrw $0, %xmm0, %eax ; F16C-NEXT: # kill: def $ax killed $ax killed $eax +; F16C-NEXT: popq %rcx ; F16C-NEXT: retq ; ; SOFTFLOAT-LABEL: test5: ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax -; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 16 ; SOFTFLOAT-NEXT: callq __truncdfhf2@PLT ; SOFTFLOAT-NEXT: popq %rcx -; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 8 ; SOFTFLOAT-NEXT: retq %val = tail call i16 @llvm.convert.to.fp16.f64(double %src) ret i16 %val diff --git a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll --- a/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll +++ b/llvm/test/CodeGen/X86/fastmath-float-half-conversion.ll @@ -1,26 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL --check-prefix=F16C -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+f16c < %s | FileCheck %s --check-prefix=ALL +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=ALL define zeroext i16 @test1_fast(double %d) #0 { -; F16C-LABEL: test1_fast: -; F16C: # %bb.0: # %entry -; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-NEXT: retq -; -; AVX-LABEL: test1_fast: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq __truncdfhf2@PLT -; AVX-NEXT: vpextrw $0, %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: popq %rcx -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; ALL-LABEL: test1_fast: +; ALL: # %bb.0: # %entry +; ALL-NEXT: pushq %rax +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vpextrw $0, %xmm0, %eax +; ALL-NEXT: # kill: def $ax killed $ax killed $eax +; ALL-NEXT: popq %rcx +; ALL-NEXT: retq entry: %0 = tail call i16 @llvm.convert.to.fp16.f64(double %d) ret i16 %0 @@ -30,14 +20,12 @@ ; ALL-LABEL: test2_fast: ; ALL: # %bb.0: # %entry ; ALL-NEXT: subq $24, %rsp -; ALL-NEXT: .cfi_def_cfa_offset 32 ; ALL-NEXT: fldt {{[0-9]+}}(%rsp) ; ALL-NEXT: fstpt (%rsp) ; ALL-NEXT: callq __truncxfhf2@PLT ; ALL-NEXT: vpextrw $0, %xmm0, %eax ; ALL-NEXT: # kill: def $ax killed $ax killed $eax ; ALL-NEXT: addq $24, %rsp -; ALL-NEXT: .cfi_def_cfa_offset 8 ; ALL-NEXT: retq entry: %0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d) @@ -45,24 +33,14 @@ } define zeroext i16 @test1(double %d) #1 { -; F16C-LABEL: test1: -; F16C: # %bb.0: # %entry -; F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; F16C-NEXT: vmovd %xmm0, %eax -; F16C-NEXT: # kill: def $ax killed $ax killed $eax -; F16C-NEXT: retq -; -; AVX-LABEL: test1: -; AVX: # %bb.0: # %entry -; AVX-NEXT: pushq %rax -; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: callq __truncdfhf2@PLT -; AVX-NEXT: vpextrw $0, %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: popq %rcx -; AVX-NEXT: .cfi_def_cfa_offset 8 -; AVX-NEXT: retq +; ALL-LABEL: test1: +; ALL: # %bb.0: # %entry +; ALL-NEXT: pushq %rax +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vpextrw $0, %xmm0, %eax +; ALL-NEXT: # kill: def $ax killed $ax killed $eax +; ALL-NEXT: popq %rcx +; ALL-NEXT: retq entry: %0 = tail call i16 @llvm.convert.to.fp16.f64(double %d) ret i16 %0 @@ -72,14 +50,12 @@ ; ALL-LABEL: test2: ; ALL: # %bb.0: # %entry ; ALL-NEXT: subq $24, %rsp -; ALL-NEXT: .cfi_def_cfa_offset 32 ; ALL-NEXT: fldt {{[0-9]+}}(%rsp) ; ALL-NEXT: fstpt (%rsp) ; ALL-NEXT: callq __truncxfhf2@PLT ; ALL-NEXT: vpextrw $0, %xmm0, %eax ; ALL-NEXT: # kill: def $ax killed $ax killed $eax ; ALL-NEXT: addq $24, %rsp -; ALL-NEXT: .cfi_def_cfa_offset 8 ; ALL-NEXT: retq entry: %0 = tail call i16 @llvm.convert.to.fp16.f80(x86_fp80 %d) @@ -89,5 +65,5 @@ declare i16 @llvm.convert.to.fp16.f64(double) declare i16 @llvm.convert.to.fp16.f80(x86_fp80) -attributes #0 = { nounwind readnone uwtable "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { nounwind readnone uwtable "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { nounwind readnone "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind readnone "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fp16.ll @@ -357,13 +357,12 @@ ; ; AVX-LABEL: fptrunc_double_to_f16: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: movw %ax, (%rsi) +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rsi, %rbx +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVX-NEXT: popq %rbx ; AVX-NEXT: retq ; ; X86-LABEL: fptrunc_double_to_f16: diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll --- a/llvm/test/CodeGen/X86/half-constrained.ll +++ b/llvm/test/CodeGen/X86/half-constrained.ll @@ -201,13 +201,13 @@ ; ; X32-F16C-LABEL: double_to_half: ; X32-F16C: ## %bb.0: -; X32-F16C-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; X32-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X32-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X32-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X32-F16C-NEXT: vmovd %xmm0, %eax -; X32-F16C-NEXT: movw %ax, _a +; X32-F16C-NEXT: subl $12, %esp +; X32-F16C-NEXT: .cfi_def_cfa_offset 16 +; X32-F16C-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X32-F16C-NEXT: vmovq %xmm0, (%esp) +; X32-F16C-NEXT: calll ___truncdfhf2 +; X32-F16C-NEXT: vpextrw $0, %xmm0, _a +; X32-F16C-NEXT: addl $12, %esp ; X32-F16C-NEXT: retl ; ; X64-NOF16C-LABEL: double_to_half: @@ -222,12 +222,11 @@ ; ; X64-F16C-LABEL: double_to_half: ; X64-F16C: ## %bb.0: -; X64-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; X64-F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; X64-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; X64-F16C-NEXT: vmovd %xmm0, %eax -; X64-F16C-NEXT: movw %ax, _a(%rip) +; X64-F16C-NEXT: pushq %rax +; X64-F16C-NEXT: .cfi_def_cfa_offset 16 +; X64-F16C-NEXT: callq ___truncdfhf2 +; X64-F16C-NEXT: vpextrw $0, %xmm0, _a(%rip) +; X64-F16C-NEXT: popq %rax ; X64-F16C-NEXT: retq %2 = tail call half @llvm.experimental.constrained.fptrunc.f16.f64(double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 store half %2, ptr @a, align 2 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -183,10 +183,11 @@ ; ; BWON-F16C-LABEL: test_trunc64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; BWON-F16C-NEXT: vmovd %xmm0, %eax -; BWON-F16C-NEXT: movw %ax, (%rdi) +; BWON-F16C-NEXT: pushq %rbx +; BWON-F16C-NEXT: movq %rdi, %rbx +; BWON-F16C-NEXT: callq __truncdfhf2@PLT +; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rbx) +; BWON-F16C-NEXT: popq %rbx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc64: @@ -681,9 +682,36 @@ ; ; BWON-F16C-LABEL: test_trunc64_vec4: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vcvtpd2ps %ymm0, %xmm0 -; BWON-F16C-NEXT: vcvtps2ph $0, %xmm0, (%rdi) +; BWON-F16C-NEXT: pushq %rbx +; BWON-F16C-NEXT: subq $64, %rsp +; BWON-F16C-NEXT: movq %rdi, %rbx +; BWON-F16C-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; BWON-F16C-NEXT: vextractf128 $1, %ymm0, %xmm0 +; BWON-F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BWON-F16C-NEXT: vzeroupper +; BWON-F16C-NEXT: callq __truncdfhf2@PLT +; BWON-F16C-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BWON-F16C-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; BWON-F16C-NEXT: # xmm0 = mem[1,0] +; BWON-F16C-NEXT: callq __truncdfhf2@PLT +; BWON-F16C-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; BWON-F16C-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BWON-F16C-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; BWON-F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; BWON-F16C-NEXT: vzeroupper +; BWON-F16C-NEXT: callq __truncdfhf2@PLT +; BWON-F16C-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BWON-F16C-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; BWON-F16C-NEXT: # xmm0 = mem[1,0] +; BWON-F16C-NEXT: callq __truncdfhf2@PLT +; BWON-F16C-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; BWON-F16C-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; BWON-F16C-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; BWON-F16C-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; BWON-F16C-NEXT: vmovq %xmm0, (%rbx) +; BWON-F16C-NEXT: addq $64, %rsp +; BWON-F16C-NEXT: popq %rbx ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: test_trunc64_vec4: diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX1 -; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2 -; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2 -; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX2 +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX +; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX512 @@ -55,21 +55,13 @@ } define <16 x float> @cvt_16i16_to_16f32(<16 x i16> %a0) nounwind { -; AVX1-LABEL: cvt_16i16_to_16f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtph2ps %xmm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vcvtph2ps %xmm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_16i16_to_16f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtph2ps %xmm0, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1 -; AVX2-NEXT: vmovaps %ymm2, %ymm0 -; AVX2-NEXT: retq +; AVX-LABEL: cvt_16i16_to_16f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtph2ps %xmm0, %ymm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX-NEXT: vmovaps %ymm2, %ymm0 +; AVX-NEXT: retq ; ; AVX512-LABEL: cvt_16i16_to_16f32: ; AVX512: # %bb.0: @@ -115,19 +107,12 @@ declare <8 x float> @llvm.experimental.constrained.fpext.v8f32.v8f16(<8 x half>, metadata) strictfp define <16 x float> @cvt_16i16_to_16f32_constrained(<16 x i16> %a0) nounwind strictfp { -; AVX1-LABEL: cvt_16i16_to_16f32_constrained: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vcvtph2ps %xmm1, %ymm1 -; AVX1-NEXT: vcvtph2ps %xmm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_16i16_to_16f32_constrained: -; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %ymm1 -; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0 -; AVX2-NEXT: retq +; AVX-LABEL: cvt_16i16_to_16f32_constrained: +; AVX: # %bb.0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vcvtph2ps %xmm1, %ymm1 +; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX-NEXT: retq ; ; AVX512-LABEL: cvt_16i16_to_16f32_constrained: ; AVX512: # %bb.0: @@ -191,17 +176,11 @@ } define <16 x float> @load_cvt_16i16_to_16f32(ptr %a0) nounwind { -; AVX1-LABEL: load_cvt_16i16_to_16f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtph2ps (%rdi), %ymm0 -; AVX1-NEXT: vcvtph2ps 16(%rdi), %ymm1 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_cvt_16i16_to_16f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtph2ps (%rdi), %ymm0 -; AVX2-NEXT: vcvtph2ps 16(%rdi), %ymm1 -; AVX2-NEXT: retq +; AVX-LABEL: load_cvt_16i16_to_16f32: +; AVX: # %bb.0: +; AVX-NEXT: vcvtph2ps (%rdi), %ymm0 +; AVX-NEXT: vcvtph2ps 16(%rdi), %ymm1 +; AVX-NEXT: retq ; ; AVX512-LABEL: load_cvt_16i16_to_16f32: ; AVX512: # %bb.0: @@ -302,21 +281,13 @@ } define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind { -; AVX1-LABEL: cvt_8i16_to_8f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtph2ps %xmm0, %ymm1 -; AVX1-NEXT: vcvtps2pd %xmm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_8i16_to_8f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtph2ps %xmm0, %ymm1 -; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 -; AVX2-NEXT: retq +; AVX-LABEL: cvt_8i16_to_8f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtph2ps %xmm0, %ymm1 +; AVX-NEXT: vcvtps2pd %xmm1, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vcvtps2pd %xmm1, %ymm1 +; AVX-NEXT: retq ; ; AVX512-LABEL: cvt_8i16_to_8f64: ; AVX512: # %bb.0: @@ -354,21 +325,13 @@ declare <4 x double> @llvm.experimental.constrained.fpext.v4f64.v4f16(<4 x half>, metadata) strictfp define <8 x double> @cvt_8i16_to_8f64_constrained(<8 x i16> %a0) nounwind strictfp { -; AVX1-LABEL: cvt_8i16_to_8f64_constrained: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtph2ps %xmm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1 -; AVX1-NEXT: vcvtps2pd %xmm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_8i16_to_8f64_constrained: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtph2ps %xmm0, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 -; AVX2-NEXT: vcvtps2pd %xmm0, %ymm0 -; AVX2-NEXT: retq +; AVX-LABEL: cvt_8i16_to_8f64_constrained: +; AVX: # %bb.0: +; AVX-NEXT: vcvtph2ps %xmm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vcvtps2pd %xmm1, %ymm1 +; AVX-NEXT: vcvtps2pd %xmm0, %ymm0 +; AVX-NEXT: retq ; ; AVX512-LABEL: cvt_8i16_to_8f64_constrained: ; AVX512: # %bb.0: @@ -439,21 +402,13 @@ } define <8 x double> @load_cvt_8i16_to_8f64(ptr %a0) nounwind { -; AVX1-LABEL: load_cvt_8i16_to_8f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtph2ps (%rdi), %ymm1 -; AVX1-NEXT: vcvtps2pd %xmm1, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vcvtps2pd %xmm1, %ymm1 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_cvt_8i16_to_8f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtph2ps (%rdi), %ymm1 -; AVX2-NEXT: vcvtps2pd %xmm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-NEXT: vcvtps2pd %xmm1, %ymm1 -; AVX2-NEXT: retq +; AVX-LABEL: load_cvt_8i16_to_8f64: +; AVX: # %bb.0: +; AVX-NEXT: vcvtph2ps (%rdi), %ymm1 +; AVX-NEXT: vcvtps2pd %xmm1, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX-NEXT: vcvtps2pd %xmm1, %ymm1 +; AVX-NEXT: retq ; ; AVX512-LABEL: load_cvt_8i16_to_8f64: ; AVX512: # %bb.0: @@ -526,19 +481,12 @@ } define <16 x i16> @cvt_16f32_to_16i16(<16 x float> %a0) nounwind { -; AVX1-LABEL: cvt_16f32_to_16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX1-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_16f32_to_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtps2ph $4, %ymm0, %xmm0 -; AVX2-NEXT: vcvtps2ph $4, %ymm1, %xmm1 -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX-LABEL: cvt_16f32_to_16i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX-NEXT: vcvtps2ph $4, %ymm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: retq ; ; AVX512-LABEL: cvt_16f32_to_16i16: ; AVX512: # %bb.0: @@ -616,19 +564,12 @@ } define void @store_cvt_16f32_to_16i16(<16 x float> %a0, ptr %a1) nounwind { -; AVX1-LABEL: store_cvt_16f32_to_16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi) -; AVX1-NEXT: vcvtps2ph $4, %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_16f32_to_16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi) -; AVX2-NEXT: vcvtps2ph $4, %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: store_cvt_16f32_to_16i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi) +; AVX-NEXT: vcvtps2ph $4, %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512-LABEL: store_cvt_16f32_to_16i16: ; AVX512: # %bb.0: @@ -648,10 +589,11 @@ define i16 @cvt_f64_to_i16(double %a0) nounwind { ; ALL-LABEL: cvt_f64_to_i16: ; ALL: # %bb.0: -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: pushq %rax +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vpextrw $0, %xmm0, %eax ; ALL-NEXT: # kill: def $ax killed $ax killed $eax +; ALL-NEXT: popq %rcx ; ALL-NEXT: retq %1 = fptrunc double %a0 to half %2 = bitcast half %1 to i16 @@ -659,35 +601,159 @@ } define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { -; ALL-LABEL: cvt_2f64_to_2i16: -; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; ALL-NEXT: retq +; AVX-LABEL: cvt_2f64_to_2i16: +; AVX: # %bb.0: +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> ret <2 x i16> %2 } define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_4i16: -; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; AVX-LABEL: cvt_4f64_to_4i16: +; AVX: # %bb.0: +; AVX-NEXT: subq $72, %rsp +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero +; AVX-NEXT: addq $72, %rsp +; AVX-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_4i16: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> ret <4 x i16> %2 } define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind { -; ALL-LABEL: cvt_4f64_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; AVX-LABEL: cvt_4f64_to_8i16_undef: +; AVX: # %bb.0: +; AVX-NEXT: subq $72, %rsp +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero +; AVX-NEXT: addq $72, %rsp +; AVX-NEXT: retq +; +; AVX512-LABEL: cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $72, %rsp +; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: addq $72, %rsp +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -697,9 +763,32 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind { ; ALL-LABEL: cvt_4f64_to_8i16_zero: ; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 +; ALL-NEXT: subq $72, %rsp +; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; ALL-NEXT: vzeroupper +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = mem[1,0] +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = mem[1,0] +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; ALL-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero +; ALL-NEXT: addq $72, %rsp ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -708,31 +797,120 @@ } define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind { -; AVX1-LABEL: cvt_8f64_to_8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtpd2ps %ymm1, %xmm1 -; AVX1-NEXT: vcvtps2ph $0, %xmm1, %xmm1 -; AVX1-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: cvt_8f64_to_8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtpd2ps %ymm1, %xmm1 -; AVX2-NEXT: vcvtps2ph $0, %xmm1, %xmm1 -; AVX2-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: cvt_8f64_to_8i16: +; AVX: # %bb.0: +; AVX-NEXT: subq $104, %rsp +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: addq $104, %rsp +; AVX-NEXT: retq ; ; AVX512-LABEL: cvt_8f64_to_8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtpd2ps %zmm0, %ymm0 -; AVX512-NEXT: vcvtps2ph $4, %ymm0, %xmm0 +; AVX512-NEXT: subq $120, %rsp +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $120, %rsp ; AVX512-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> @@ -746,10 +924,11 @@ define void @store_cvt_f64_to_i16(double %a0, ptr %a1) nounwind { ; ALL-LABEL: store_cvt_f64_to_i16: ; ALL: # %bb.0: -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: movw %ax, (%rdi) +; ALL-NEXT: pushq %rbx +; ALL-NEXT: movq %rdi, %rbx +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vpextrw $0, %xmm0, (%rbx) +; ALL-NEXT: popq %rbx ; ALL-NEXT: retq %1 = fptrunc double %a0 to half %2 = bitcast half %1 to i16 @@ -760,9 +939,20 @@ define void @store_cvt_2f64_to_2i16(<2 x double> %a0, ptr %a1) nounwind { ; ALL-LABEL: store_cvt_2f64_to_2i16: ; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %xmm0, %xmm0 -; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; ALL-NEXT: vmovss %xmm0, (%rdi) +; ALL-NEXT: pushq %rbx +; ALL-NEXT: subq $32, %rsp +; ALL-NEXT: movq %rdi, %rbx +; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = mem[1,0] +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; ALL-NEXT: vmovd %xmm0, (%rbx) +; ALL-NEXT: addq $32, %rsp +; ALL-NEXT: popq %rbx ; ALL-NEXT: retq %1 = fptrunc <2 x double> %a0 to <2 x half> %2 = bitcast <2 x half> %1 to <2 x i16> @@ -773,9 +963,36 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind { ; ALL-LABEL: store_cvt_4f64_to_4i16: ; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; ALL-NEXT: vcvtps2ph $0, %xmm0, (%rdi) +; ALL-NEXT: pushq %rbx +; ALL-NEXT: subq $64, %rsp +; ALL-NEXT: movq %rdi, %rbx +; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; ALL-NEXT: vzeroupper +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = mem[1,0] +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; ALL-NEXT: vzeroupper +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = mem[1,0] +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; ALL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; ALL-NEXT: vmovq %xmm0, (%rbx) +; ALL-NEXT: addq $64, %rsp +; ALL-NEXT: popq %rbx ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -784,13 +1001,78 @@ } define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind { -; ALL-LABEL: store_cvt_4f64_to_8i16_undef: -; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq +; AVX-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $64, %rsp +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero +; AVX-NEXT: vmovaps %xmm0, (%rbx) +; AVX-NEXT: addq $64, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq +; +; AVX512-LABEL: store_cvt_4f64_to_8i16_undef: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $64, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0] +; AVX512-NEXT: vmovaps %xmm0, (%rbx) +; AVX512-NEXT: addq $64, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> @@ -801,10 +1083,36 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind { ; ALL-LABEL: store_cvt_4f64_to_8i16_zero: ; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; ALL-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; ALL-NEXT: vmovaps %xmm0, (%rdi) +; ALL-NEXT: pushq %rbx +; ALL-NEXT: subq $64, %rsp +; ALL-NEXT: movq %rdi, %rbx +; ALL-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; ALL-NEXT: vzeroupper +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = mem[1,0] +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; ALL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; ALL-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; ALL-NEXT: vzeroupper +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; ALL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = mem[1,0] +; ALL-NEXT: callq __truncdfhf2@PLT +; ALL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; ALL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; ALL-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; ALL-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero +; ALL-NEXT: vmovaps %xmm0, (%rbx) +; ALL-NEXT: addq $64, %rsp +; ALL-NEXT: popq %rbx ; ALL-NEXT: retq %1 = fptrunc <4 x double> %a0 to <4 x half> %2 = bitcast <4 x half> %1 to <4 x i16> @@ -814,33 +1122,128 @@ } define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind { -; AVX1-LABEL: store_cvt_8f64_to_8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtpd2ps %ymm1, %xmm1 -; AVX1-NEXT: vcvtps2ph $0, %xmm1, %xmm1 -; AVX1-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovaps %xmm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_8f64_to_8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtpd2ps %ymm1, %xmm1 -; AVX2-NEXT: vcvtps2ph $0, %xmm1, %xmm1 -; AVX2-NEXT: vcvtpd2ps %ymm0, %xmm0 -; AVX2-NEXT: vcvtps2ph $0, %xmm0, %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovaps %xmm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: store_cvt_8f64_to_8i16: +; AVX: # %bb.0: +; AVX-NEXT: pushq %rbx +; AVX-NEXT: subq $96, %rsp +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[1,0] +; AVX-NEXT: callq __truncdfhf2@PLT +; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovdqa %xmm0, (%rbx) +; AVX-NEXT: addq $96, %rsp +; AVX-NEXT: popq %rbx +; AVX-NEXT: retq ; ; AVX512-LABEL: store_cvt_8f64_to_8i16: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtpd2ps %zmm0, %ymm0 -; AVX512-NEXT: vcvtps2ph $4, %ymm0, (%rdi) +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $112, %rsp +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __truncdfhf2@PLT +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: vmovdqa %xmm0, (%rbx) +; AVX512-NEXT: addq $112, %rsp +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %1 = fptrunc <8 x double> %a0 to <8 x half> %2 = bitcast <8 x half> %1 to <8 x i16> @@ -849,23 +1252,14 @@ } define void @store_cvt_32f32_to_32f16(<32 x float> %a0, ptr %a1) nounwind { -; AVX1-LABEL: store_cvt_32f32_to_32f16: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvtps2ph $4, %ymm3, 48(%rdi) -; AVX1-NEXT: vcvtps2ph $4, %ymm2, 32(%rdi) -; AVX1-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi) -; AVX1-NEXT: vcvtps2ph $4, %ymm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_cvt_32f32_to_32f16: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvtps2ph $4, %ymm3, 48(%rdi) -; AVX2-NEXT: vcvtps2ph $4, %ymm2, 32(%rdi) -; AVX2-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi) -; AVX2-NEXT: vcvtps2ph $4, %ymm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX-LABEL: store_cvt_32f32_to_32f16: +; AVX: # %bb.0: +; AVX-NEXT: vcvtps2ph $4, %ymm3, 48(%rdi) +; AVX-NEXT: vcvtps2ph $4, %ymm2, 32(%rdi) +; AVX-NEXT: vcvtps2ph $4, %ymm1, 16(%rdi) +; AVX-NEXT: vcvtps2ph $4, %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq ; ; AVX512-LABEL: store_cvt_32f32_to_32f16: ; AVX512: # %bb.0: