diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1951,6 +1951,10 @@ setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); + setOperationAction(ISD::FROUND, MVT::f16, Custom); + setOperationAction(ISD::STRICT_FROUND, MVT::f16, Custom); + setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); + setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); @@ -22496,6 +22500,10 @@ /// compiling with trapping math, we can emulate this with /// floor(X + copysign(nextafter(0.5, 0.0), X)). static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) { + if (Op.getOpcode() == ISD::STRICT_FROUND && + Op.getSimpleValueType() == MVT::f16) + report_fatal_error("For now cannot emit strict round(fp16) at backend for " + "lacking library support."); SDValue N0 = Op.getOperand(0); SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); @@ -31064,6 +31072,7 @@ case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: case ISD::FSUB: return lowerFaddFsub(Op, DAG); + case ISD::STRICT_FROUND: case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FABS: case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -1,8 +1,79 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX1 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512FP16 + +define half @round_f16(half %h) { +; SSE2-LABEL: round_f16: +; SSE2: ## %bb.0: ## %entry +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movzwl %di, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: retq +; +; SSE41-LABEL: round_f16: +; SSE41: ## %bb.0: ## %entry +; SSE41-NEXT: pushq %rax +; SSE41-NEXT: .cfi_def_cfa_offset 16 +; SSE41-NEXT: movzwl %di, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; SSE41-NEXT: andps %xmm0, %xmm1 +; SSE41-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: addss %xmm0, %xmm1 +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: roundss $11, %xmm1, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: popq %rcx +; SSE41-NEXT: retq +; +; AVX1-LABEL: round_f16: +; AVX1: ## %bb.0: ## %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: movzwl %di, %edi +; 
AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: popq %rcx +; AVX1-NEXT: retq +; +; AVX512F-LABEL: round_f16: +; AVX512F: ## %bb.0: ## %entry +; AVX512F-NEXT: movzwl %di, %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512F-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: ## kill: def $ax killed $ax killed $eax +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: round_f16: +; AVX512FP16: ## %bb.0: ## %entry +; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1] +; AVX512FP16-NEXT: vpternlogq $248, %xmm1, %xmm0, %xmm2 +; AVX512FP16-NEXT: vaddsh %xmm2, %xmm0, %xmm0 +; AVX512FP16-NEXT: vrndscalesh $11, %xmm0, %xmm0, %xmm0 +; AVX512FP16-NEXT: retq +entry: + %a = call half @llvm.round.f16(half %h) + ret half %a +} define float @round_f32(float %x) { ; SSE2-LABEL: round_f32: @@ -30,10 +101,8 @@ ; ; AVX512-LABEL: round_f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] -; AVX512-NEXT: vorps %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -56,15 +125,23 @@ ; SSE41-NEXT: roundsd $11, %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: round_f64: -; AVX: ## %bb.0: -; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1] -; AVX-NEXT: ## xmm2 = mem[0,0] -; AVX-NEXT: vorpd %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: round_f64: +; AVX1: ## %bb.0: +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1] +; AVX1-NEXT: ## xmm2 = mem[0,0] +; AVX1-NEXT: vorpd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: round_f64: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1] +; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq %a = call double @llvm.round.f64(double %x) ret double %a } @@ -117,10 +194,8 @@ ; ; AVX512-LABEL: round_v4f32: ; AVX512: ## 
%bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] -; AVX512-NEXT: vorps %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vroundps $11, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -154,13 +229,21 @@ ; SSE41-NEXT: roundpd $11, %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: round_v2f64: -; AVX: ## %bb.0: -; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vroundpd $11, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: round_v2f64: +; AVX1: ## %bb.0: +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX1-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX512-LABEL: round_v2f64: +; AVX512: ## %bb.0: +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4.9999999999999994E-1,4.9999999999999994E-1] +; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vroundpd $11, %xmm0, %xmm0 +; AVX512-NEXT: retq %a = call <2 x double> @llvm.round.v2f64(<2 x double> %x) ret <2 x double> %a } @@ -241,10 +324,8 @@ ; ; AVX512-LABEL: round_v8f32: ; AVX512: ## %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] -; AVX512-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX512-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1 ; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vroundps $11, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -304,10 +385,8 @@ ; ; AVX512-LABEL: round_v4f64: ; AVX512: ## %bb.0: -; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512-NEXT: vandpd %ymm1, %ymm0, %ymm1 -; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] -; AVX512-NEXT: vorpd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] +; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 ; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -561,6 +640,7 @@ ret <8 x double> %a } +declare half @llvm.round.f16(half) declare float @llvm.round.f32(float) declare double @llvm.round.f64(double) declare <4 x float> @llvm.round.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll --- a/llvm/test/CodeGen/X86/fp-roundeven.ll +++ b/llvm/test/CodeGen/X86/fp-roundeven.ll @@ -2,7 +2,62 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 ; RUN: llc < 
%s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512FP16 + +define half @roundeven_f16(half %h) { +; SSE2-LABEL: roundeven_f16: +; SSE2: ## %bb.0: ## %entry +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: .cfi_def_cfa_offset 16 +; SSE2-NEXT: movzwl %di, %edi +; SSE2-NEXT: callq ___extendhfsf2 +; SSE2-NEXT: callq _roundevenf +; SSE2-NEXT: callq ___truncsfhf2 +; SSE2-NEXT: popq %rcx +; SSE2-NEXT: retq +; +; SSE41-LABEL: roundeven_f16: +; SSE41: ## %bb.0: ## %entry +; SSE41-NEXT: pushq %rax +; SSE41-NEXT: .cfi_def_cfa_offset 16 +; SSE41-NEXT: movzwl %di, %edi +; SSE41-NEXT: callq ___extendhfsf2 +; SSE41-NEXT: roundss $8, %xmm0, %xmm0 +; SSE41-NEXT: callq ___truncsfhf2 +; SSE41-NEXT: popq %rcx +; SSE41-NEXT: retq +; +; AVX1-LABEL: roundeven_f16: +; AVX1: ## %bb.0: ## %entry +; AVX1-NEXT: pushq %rax +; AVX1-NEXT: .cfi_def_cfa_offset 16 +; AVX1-NEXT: movzwl %di, %edi +; AVX1-NEXT: callq ___extendhfsf2 +; AVX1-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: callq ___truncsfhf2 +; AVX1-NEXT: popq %rcx +; AVX1-NEXT: retq +; +; AVX512F-LABEL: roundeven_f16: +; AVX512F: ## %bb.0: ## %entry +; AVX512F-NEXT: movzwl %di, %eax +; AVX512F-NEXT: vmovd %eax, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vroundss $8, %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: ## kill: def $ax killed $ax killed $eax +; AVX512F-NEXT: retq +; +; AVX512FP16-LABEL: roundeven_f16: +; AVX512FP16: ## %bb.0: ## %entry +; AVX512FP16-NEXT: vrndscalesh $8, %xmm0, %xmm0, %xmm0 +; AVX512FP16-NEXT: retq +entry: + %a = call half @llvm.roundeven.f16(half %h) + ret half %a +} define float @roundeven_f32(float %x) { ; SSE2-LABEL: roundeven_f32: @@ -408,6 +463,7 @@ ret <8 x double> %a } +declare half @llvm.roundeven.f16(half) declare float @llvm.roundeven.f32(float) declare double @llvm.roundeven.f64(double) declare <4 x float> @llvm.roundeven.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll --- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll @@ -7,6 +7,7 @@ declare half @llvm.experimental.constrained.trunc.f16(half, metadata) declare half @llvm.experimental.constrained.rint.f16(half, metadata, metadata) declare half @llvm.experimental.constrained.nearbyint.f16(half, metadata, metadata) +declare half @llvm.experimental.constrained.roundeven.f16(half, metadata) define half @fceil32(half %f) #0 { ; X86-LABEL: fceil32: @@ -85,4 +86,20 @@ ret half %res } +define half @froundeven16(half %f) #0 { +; X86-LABEL: froundeven16: +; X86: # %bb.0: +; X86-NEXT: vrndscalesh $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: froundeven16: +; X64: # %bb.0: +; X64-NEXT: vrndscalesh $8, %xmm0, %xmm0, %xmm0 +; X64-NEXT: retq + + %res = call half @llvm.experimental.constrained.roundeven.f16( + half %f, metadata !"fpexcept.strict") #0 + ret half %res +} + attributes #0 = { strictfp }
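
For reference, a minimal standalone IR sketch (not part of this patch; the file and function names are hypothetical) that exercises the two new f16 lowerings. Fed to llc with the same -mattr=+avx512fp16,+avx512vl flags used in the RUN lines above, the CHECK lines in this patch suggest round.f16 lowers through LowerFROUND's copysign(0.5)/add/vrndscalesh $11 sequence, while roundeven.f16 is Legal and selects a single vrndscalesh $8.

; round-fp16-example.ll (hypothetical)
; llc round-fp16-example.ll -mtriple=x86_64-apple-darwin -mattr=+avx512fp16,+avx512vl

; Expected to go through LowerFROUND: copysign the just-under-0.5 constant,
; add it to the input, then truncate toward zero with vrndscalesh $11.
define half @example_round(half %h) {
  %r = call half @llvm.round.f16(half %h)
  ret half %r
}

; FROUNDEVEN is marked Legal for f16, so this should select vrndscalesh $8 directly.
define half @example_roundeven(half %h) {
  %r = call half @llvm.roundeven.f16(half %h)
  ret half %r
}

declare half @llvm.round.f16(half)
declare half @llvm.roundeven.f16(half)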