diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1951,6 +1951,10 @@
     setOperationAction(ISD::SETCC, MVT::f16, Custom);
     setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
     setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
+    setOperationAction(ISD::FROUND, MVT::f16, Custom);
+    setOperationAction(ISD::STRICT_FROUND, MVT::f16, Custom);
+    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+    setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
     setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
     setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
@@ -22496,6 +22500,10 @@
 /// compiling with trapping math, we can emulate this with
 /// floor(X + copysign(nextafter(0.5, 0.0), X)).
 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
+  if (Op.getOpcode() == ISD::STRICT_FROUND &&
+      Op.getSimpleValueType() == MVT::f16)
+    report_fatal_error("For now cannot emit strict round(fp16) at backend for "
+                       "lacking library support.");
   SDValue N0 = Op.getOperand(0);
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
@@ -31064,6 +31072,7 @@
   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
   case ISD::FADD:
   case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
+  case ISD::STRICT_FROUND:
   case ISD::FROUND:             return LowerFROUND(Op, DAG);
   case ISD::FABS:
   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -3,6 +3,63 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=AVX512FP16
+
+define half @round_f16(half %h) {
+; SSE2-LABEL: round_f16:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    movzwl %di, %edi
+; SSE2-NEXT:    callq ___extendhfsf2
+; SSE2-NEXT:    callq _roundf
+; SSE2-NEXT:    callq ___truncsfhf2
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: round_f16:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    pushq %rax
+; SSE41-NEXT:    .cfi_def_cfa_offset 16
+; SSE41-NEXT:    movzwl %di, %edi
+; SSE41-NEXT:    callq ___extendhfsf2
+; SSE41-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT:    andps %xmm0, %xmm1
+; SSE41-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT:    addss %xmm0, %xmm1
+; SSE41-NEXT:    xorps %xmm0, %xmm0
+; SSE41-NEXT:    roundss $11, %xmm1, %xmm0
+; SSE41-NEXT:    callq ___truncsfhf2
+; SSE41-NEXT:    popq %rcx
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: round_f16:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    pushq %rax
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    movzwl %di, %edi
+; AVX1-NEXT:    callq ___extendhfsf2
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT:    vorps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    callq ___truncsfhf2
+; AVX1-NEXT:    popq %rcx
+; AVX1-NEXT:    retq
+;
+; AVX512FP16-LABEL: round_f16:
+; AVX512FP16:       # %bb.0:
+; AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512FP16-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1,4.9976E-1]
+; AVX512FP16-NEXT:    vpternlogq $248, %xmm1, %xmm0, %xmm2
+; AVX512FP16-NEXT:    vaddsh %xmm2, %xmm0, %xmm0
+; AVX512FP16-NEXT:    vrndscalesh $11, %xmm0, %xmm0, %xmm0
+; AVX512FP16-NEXT:    retq
+entry:
+  %a = call half @llvm.round.f16(half %h)
+  ret half %a
+}
 
 define float @round_f32(float %x) {
 ; SSE2-LABEL: round_f32:
@@ -561,6 +618,7 @@
   ret <8 x double> %a
 }
 
+declare half @llvm.round.f16(half)
 declare float @llvm.round.f32(float)
 declare double @llvm.round.f64(double)
 declare <4 x float> @llvm.round.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/X86/fp-roundeven.ll b/llvm/test/CodeGen/X86/fp-roundeven.ll
--- a/llvm/test/CodeGen/X86/fp-roundeven.ll
+++ b/llvm/test/CodeGen/X86/fp-roundeven.ll
@@ -3,6 +3,50 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=AVX512FP16
+
+define half @roundeven_f16(half %h) {
+; SSE2-LABEL: roundeven_f16:
+; SSE2:       ## %bb.0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    .cfi_def_cfa_offset 16
+; SSE2-NEXT:    movzwl %di, %edi
+; SSE2-NEXT:    callq ___extendhfsf2
+; SSE2-NEXT:    callq _roundevenf
+; SSE2-NEXT:    callq ___truncsfhf2
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: roundeven_f16:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    pushq %rax
+; SSE41-NEXT:    .cfi_def_cfa_offset 16
+; SSE41-NEXT:    movzwl %di, %edi
+; SSE41-NEXT:    callq ___extendhfsf2
+; SSE41-NEXT:    roundss $8, %xmm0, %xmm0
+; SSE41-NEXT:    callq ___truncsfhf2
+; SSE41-NEXT:    popq %rcx
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: roundeven_f16:
+; AVX1:       ## %bb.0:
+; AVX1-NEXT:    pushq %rax
+; AVX1-NEXT:    .cfi_def_cfa_offset 16
+; AVX1-NEXT:    movzwl %di, %edi
+; AVX1-NEXT:    callq ___extendhfsf2
+; AVX1-NEXT:    vroundss $8, %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    callq ___truncsfhf2
+; AVX1-NEXT:    popq %rcx
+; AVX1-NEXT:    retq
+;
+; AVX512FP16-LABEL: roundeven_f16:
+; AVX512FP16:       # %bb.0:
+; AVX512FP16-NEXT:    vrndscalesh $8, %xmm0, %xmm0, %xmm0
+; AVX512FP16-NEXT:    retq
+entry:
+  %a = call half @llvm.roundeven.f16(half %h)
+  ret half %a
+}
 
 define float @roundeven_f32(float %x) {
 ; SSE2-LABEL: roundeven_f32:
@@ -408,6 +452,7 @@
   ret <8 x double> %a
 }
 
+declare half @llvm.roundeven.f16(half)
 declare float @llvm.roundeven.f32(float)
 declare double @llvm.roundeven.f64(double)
 declare <4 x float> @llvm.roundeven.v4f32(<4 x float>)
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll
@@ -7,6 +7,7 @@
 declare half @llvm.experimental.constrained.trunc.f16(half, metadata)
 declare half @llvm.experimental.constrained.rint.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.nearbyint.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.roundeven.f16(half, metadata)
 
 define half @fceil32(half %f) #0 {
 ; X86-LABEL: fceil32:
@@ -85,4 +86,20 @@
   ret half %res
 }
 
+define half @froundeven16(half %f) #0 {
+; X86-LABEL: froundeven16:
+; X86:       # %bb.0:
+; X86-NEXT:    vrndscalesh $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: froundeven16:
+; X64:       # %bb.0:
+; X64-NEXT:    vrndscalesh $8, %xmm0, %xmm0, %xmm0
+; X64-NEXT:    retq
+
+  %res = call half @llvm.experimental.constrained.roundeven.f16(
+           half %f, metadata !"fpexcept.strict") #0
+  ret half %res
+}
+
 attributes #0 = { strictfp }
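
Note on the emulation exercised above (not part of the patch): the LowerFROUND comment describes rounding to nearest with ties away from zero as X + copysign(nextafter(0.5, 0.0), X) followed by rounding toward an integer, and the check lines show the corresponding instruction sequence (the 4.9999997E-1 constant, addss/vaddsh, then roundss/vrndscalesh with immediate 11, i.e. truncate toward zero with precision exceptions suppressed; immediate 8 is round-to-nearest-even, hence the FROUNDEVEN tests). The following self-contained C++ sketch mirrors that sequence on float; emulated_roundf is a hypothetical helper for illustration only, not an LLVM API.

// Sketch of the FROUND emulation quoted in the LowerFROUND comment:
//   round(X) == trunc(X + copysign(nextafter(0.5f, 0.0f), X))
// nextafter(0.5f, 0.0f) is the largest float below 0.5 (the 4.9999997E-1
// constant in the AVX1 check lines); using it instead of 0.5 keeps inputs
// just below one half from being pushed across the halfway point by the
// rounding of the addition, while ties such as 2.5f still round away from
// zero as ISD::FROUND requires.
#include <cmath>
#include <cstdio>

static float emulated_roundf(float x) { // hypothetical helper, illustration only
  const float almost_half = std::nextafterf(0.5f, 0.0f); // 0x1.fffffep-2
  return std::truncf(x + std::copysignf(almost_half, x));
}

int main() {
  for (float x : {2.5f, -2.5f, 0.49999997f, -0.49999997f, 1.4f})
    std::printf("emulated_roundf(%.8g) = %g  (libm roundf: %g)\n", x,
                emulated_roundf(x), std::roundf(x));
  return 0;
}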