diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1078,6 +1078,8 @@
       setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
       setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+
+      setOperationAction(ISD::FROUND, RoundedTy, Custom);
     }
 
     setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
@@ -1170,6 +1172,9 @@
       setOperationAction(ISD::STRICT_FRINT, VT, Legal);
       setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+
+      setOperationAction(ISD::FROUND, VT, Custom);
+
       setOperationAction(ISD::FNEG, VT, Custom);
       setOperationAction(ISD::FABS, VT, Custom);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
@@ -1535,6 +1540,8 @@
       setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
 
+      setOperationAction(ISD::FROUND, VT, Custom);
+
       setOperationAction(ISD::SELECT, VT, Custom);
     }
 
@@ -20450,6 +20457,30 @@
   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
 }
 
+/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
+/// This mode isn't supported in hardware on X86. But as long as we aren't
+/// compiling with trapping math, we can emulate this with
+/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
+static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // N0 += copysign(nextafter(0.5, 0.0), N0)
+  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+  bool Ignored;
+  APFloat Point5Pred = APFloat(0.5f);
+  Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
+  Point5Pred.next(/*nextDown*/true);
+
+  SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
+                              DAG.getConstantFP(Point5Pred, dl, VT), N0);
+  N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
+
+  // Truncate the result to remove fraction.
+  return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
+}
+
 /// The only differences between FABS and FNEG are the mask and the logic op.
 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -28623,6 +28654,7 @@
   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
   case ISD::FADD:
   case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
+  case ISD::FROUND:             return LowerFROUND(Op, DAG);
   case ISD::FABS:
   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -1067,13 +1067,25 @@
 define float @round_v4f32(<4 x float> %x) nounwind {
 ; X64-LABEL: round_v4f32:
 ; X64:       # %bb.0:
-; X64-NEXT:    jmp roundf # TAILCALL
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT:    vandps %xmm1, %xmm0, %xmm1
+; X64-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; X64-NEXT:    vorps %xmm1, %xmm2, %xmm1
+; X64-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
+; X64-NEXT:    retq
 ;
 ; X86-LABEL: round_v4f32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %eax
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT:    vandps %xmm1, %xmm0, %xmm1
+; X86-NEXT:    vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; X86-NEXT:    vorps %xmm1, %xmm2, %xmm1
+; X86-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vroundss $11, %xmm0, %xmm0, %xmm0
 ; X86-NEXT:    vmovss %xmm0, (%esp)
-; X86-NEXT:    calll roundf
+; X86-NEXT:    flds (%esp)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
   %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
@@ -1084,17 +1096,32 @@
 define double @round_v4f64(<4 x double> %x) nounwind {
 ; X64-LABEL: round_v4f64:
 ; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X64-NEXT:    vandpd {{.*}}(%rip), %xmm0, %xmm1
+; X64-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; X64-NEXT:    # xmm2 = mem[0,0]
+; X64-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; X64-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; X64-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vzeroupper
-; X64-NEXT:    jmp round # TAILCALL
+; X64-NEXT:    retq
 ;
 ; X86-LABEL: round_v4f64:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    vmovlps %xmm0, (%esp)
+; X86-NEXT:    vandpd {{\.LCPI.*}}, %xmm0, %xmm1
+; X86-NEXT:    vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; X86-NEXT:    # xmm2 = mem[0,0]
+; X86-NEXT:    vorpd %xmm1, %xmm2, %xmm1
+; X86-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vroundsd $11, %xmm0, %xmm0, %xmm0
+; X86-NEXT:    vmovsd %xmm0, (%esp)
+; X86-NEXT:    fldl (%esp)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    vzeroupper
-; X86-NEXT:    calll round
-; X86-NEXT:    addl $8, %esp
 ; X86-NEXT:    retl
   %v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
   %r = extractelement <4 x double> %v, i32 0
diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll
--- a/llvm/test/CodeGen/X86/vec-libcalls.ll
+++ b/llvm/test/CodeGen/X86/vec-libcalls.ll
@@ -386,16 +386,10 @@
 define <2 x float> @round_v2f32(<2 x float> %x) nounwind {
 ; CHECK-LABEL: round_v2f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $40, %rsp
-; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT:    callq roundf
-; CHECK-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT:    vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT:    # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT:    callq roundf
-; CHECK-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-NEXT:    vorps {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vroundps $11, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %r = call <2 x float> @llvm.round.v2f32(<2 x float> %x)
   ret <2 x float> %r
diff --git a/llvm/test/CodeGen/X86/vec_round.ll b/llvm/test/CodeGen/X86/vec_round.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/X86/vec_round.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mcpu=nehalem -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-declare void @use(<2 x double>)
-
-; Function Attrs: nounwind uwtable
-define void @test() {
-; CHECK-LABEL: test:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    callq round
-; CHECK-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT:    callq use
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
-entry:
-  %tmp = call <2 x double> @llvm.round.v2f64(<2 x double> undef)
-  call void @use(<2 x double> %tmp)
-  ret void
-}
-
-; Function Attrs: nounwind readonly
-declare <2 x double> @llvm.round.v2f64(<2 x double>) #0
-
-attributes #0 = { nounwind readonly }
-
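
Note: the C++ snippet below is only a standalone sketch of the per-element math that the new LowerFROUND lowering emits, assuming the default round-to-nearest-even FP environment and non-trapping math; the emulated_round helper and the sample inputs are invented for illustration and are not part of the patch. The lowered sequence (an and/or pair that materializes copysign of nextafter(0.5, 0.0), an add, then vroundss/vroundsd/vroundps with immediate $11, i.e. round-toward-zero with the precision exception suppressed) computes trunc(x + copysign(nextafter(0.5, 0.0), x)), where the magic constant is 4.9999997E-1 for f32 and 4.9999999999999994E-1 for f64, as seen in the updated checks.

// Illustration only -- not LLVM code. emulated_round() mirrors the math that
// LowerFROUND emits per element: trunc(x + copysign(nextafter(0.5, 0.0), x)).
#include <cmath>
#include <cstdio>

static double emulated_round(double X) {
  // Largest double strictly below 0.5, i.e. the constant LowerFROUND builds
  // with APFloat(0.5) followed by next(/*nextDown=*/true).
  const double Point5Pred = std::nextafter(0.5, 0.0);
  return std::trunc(X + std::copysign(Point5Pred, X));
}

int main() {
  // Spot-check against libm round(): ties must round away from zero.
  const double Tests[] = {2.5, -2.5, 0.5, 0.49999999999999994, -1.6, 1.4};
  for (double X : Tests)
    std::printf("x=%.17g  emulated=%g  libm=%g\n", X, emulated_round(X),
                std::round(X));
  return 0;
}

Using the largest value below 0.5 instead of 0.5 itself matters for inputs such as 0.49999999999999994: adding exactly 0.5 would round up to 1.0 under round-to-nearest-even and make the truncated result off by one, whereas adding 0.5's predecessor keeps the sum below 1.0.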