diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1078,6 +1078,8 @@
       setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
       setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+
+      setOperationAction(ISD::FROUND, RoundedTy, Custom);
     }
 
     setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
@@ -1170,6 +1172,9 @@
       setOperationAction(ISD::STRICT_FRINT, VT, Legal);
       setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+
+      setOperationAction(ISD::FROUND, VT, Custom);
+
       setOperationAction(ISD::FNEG, VT, Custom);
       setOperationAction(ISD::FABS, VT, Custom);
       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
@@ -1535,6 +1540,8 @@
       setOperationAction(ISD::FNEARBYINT, VT, Legal);
       setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
 
+      setOperationAction(ISD::FROUND, VT, Custom);
+
       setOperationAction(ISD::SELECT, VT, Custom);
     }
 
@@ -20450,6 +20457,26 @@
   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
 }
 
+static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+
+  // N0 += copysign(nextafter(0.5, 0.0), N0)
+  const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+  bool Ignored;
+  APFloat Point5Pred = APFloat(0.5f);
+  Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
+  Point5Pred.next(/*nextDown*/true);
+
+  SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
+                              DAG.getConstantFP(Point5Pred, dl, VT), N0);
+  N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
+
+  // Truncate the result to remove fraction.
+  return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
+}
+
 /// The only differences between FABS and FNEG are the mask and the logic op.
 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -28625,6 +28652,7 @@
   case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
   case ISD::FADD:
   case ISD::FSUB: return lowerFaddFsub(Op, DAG);
+  case ISD::FROUND: return LowerFROUND(Op, DAG);
   case ISD::FABS:
   case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
   case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -1067,13 +1067,25 @@
 define float @round_v4f32(<4 x float> %x) nounwind {
 ; X64-LABEL: round_v4f32:
 ; X64: # %bb.0:
-; X64-NEXT: jmp roundf # TAILCALL
+; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X64-NEXT: vandps %xmm1, %xmm0, %xmm1
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; X64-NEXT: vorps %xmm1, %xmm2, %xmm1
+; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; X64-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; X64-NEXT: retq
 ;
 ; X86-LABEL: round_v4f32:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %eax
+; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; X86-NEXT: vandps %xmm1, %xmm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; X86-NEXT: vorps %xmm1, %xmm2, %xmm1
+; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; X86-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
 ; X86-NEXT: vmovss %xmm0, (%esp)
-; X86-NEXT: calll roundf
+; X86-NEXT: flds (%esp)
 ; X86-NEXT: popl %eax
 ; X86-NEXT: retl
   %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
@@ -1084,17 +1096,32 @@
 define double @round_v4f64(<4 x double> %x) nounwind {
 ; X64-LABEL: round_v4f64:
 ; X64: # %bb.0:
-; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; X64-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm1
+; X64-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; X64-NEXT: # xmm2 = mem[0,0]
+; X64-NEXT: vorpd %xmm1, %xmm2, %xmm1
+; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; X64-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
 ; X64-NEXT: vzeroupper
-; X64-NEXT: jmp round # TAILCALL
+; X64-NEXT: retq
 ;
 ; X86-LABEL: round_v4f64:
 ; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-8, %esp
 ; X86-NEXT: subl $8, %esp
-; X86-NEXT: vmovlps %xmm0, (%esp)
+; X86-NEXT: vandpd {{\.LCPI.*}}, %xmm0, %xmm1
+; X86-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; X86-NEXT: # xmm2 = mem[0,0]
+; X86-NEXT: vorpd %xmm1, %xmm2, %xmm1
+; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; X86-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
+; X86-NEXT: vmovsd %xmm0, (%esp)
+; X86-NEXT: fldl (%esp)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
 ; X86-NEXT: vzeroupper
-; X86-NEXT: calll round
-; X86-NEXT: addl $8, %esp
 ; X86-NEXT: retl
   %v = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
   %r = extractelement <4 x double> %v, i32 0
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
new file mode
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -0,0 +1,573 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
+
+define float @round_f32(float %x) {
+; SSE2-LABEL: round_f32:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: jmp _roundf ## TAILCALL
+;
+; SSE41-LABEL: round_f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT: andps %xmm0, %xmm1
+; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT: orps %xmm1, %xmm2
+; SSE41-NEXT: addss %xmm0, %xmm2
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: roundss $11, %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: round_f32:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT: vorps %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: round_f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vorps %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %a = call float @llvm.round.f32(float %x)
+  ret float %a
+}
+
+define double @round_f64(double %x) {
+; SSE2-LABEL: round_f64:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: jmp _round ## TAILCALL
+;
+; SSE41-LABEL: round_f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT: andpd %xmm0, %xmm1
+; SSE41-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE41-NEXT: orpd %xmm1, %xmm2
+; SSE41-NEXT: addsd %xmm0, %xmm2
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: roundsd $11, %xmm2, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: round_f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX-NEXT: ## xmm2 = mem[0,0]
+; AVX-NEXT: vorpd %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %a = call double @llvm.round.f64(double %x)
+  ret double %a
+}
+
+define <4 x float> @round_v4f32(<4 x float> %x) {
+; SSE2-LABEL: round_v4f32:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $56, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 64
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: addq $56, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: round_v4f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT: andps %xmm0, %xmm1
+; SSE41-NEXT: orps {{.*}}(%rip), %xmm1
+; SSE41-NEXT: addps %xmm0, %xmm1
+; SSE41-NEXT: roundps $11, %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: round_v4f32:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vorps {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vroundps $11, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: round_v4f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vorps %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vroundps $11, %xmm0, %xmm0
+; AVX512-NEXT: retq
+  %a = call <4 x float> @llvm.round.v4f32(<4 x float> %x)
+  ret <4 x float> %a
+}
+
+define <2 x double> @round_v2f64(<2 x double> %x) {
+; SSE2-LABEL: round_v2f64:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $40, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 48
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: addq $40, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: round_v2f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT: andpd %xmm0, %xmm1
+; SSE41-NEXT: orpd {{.*}}(%rip), %xmm1
+; SSE41-NEXT: addpd %xmm0, %xmm1
+; SSE41-NEXT: roundpd $11, %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: round_v2f64:
+; AVX: ## %bb.0:
+; AVX-NEXT: vandpd {{.*}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vorpd {{.*}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vroundpd $11, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %a = call <2 x double> @llvm.round.v2f64(<2 x double> %x)
+  ret <2 x double> %a
+}
+
+define <8 x float> @round_v8f32(<8 x float> %x) {
+; SSE2-LABEL: round_v8f32:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $72, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 80
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd (%rsp), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: addq $72, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: round_v8f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT: movaps %xmm0, %xmm3
+; SSE41-NEXT: andps %xmm2, %xmm3
+; SSE41-NEXT: movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; SSE41-NEXT: orps %xmm4, %xmm3
+; SSE41-NEXT: addps %xmm0, %xmm3
+; SSE41-NEXT: roundps $11, %xmm3, %xmm0
+; SSE41-NEXT: andps %xmm1, %xmm2
+; SSE41-NEXT: orps %xmm4, %xmm2
+; SSE41-NEXT: addps %xmm1, %xmm2
+; SSE41-NEXT: roundps $11, %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: round_v8f32:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm1
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: round_v8f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512-NEXT: vandps %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX512-NEXT: retq
+  %a = call <8 x float> @llvm.round.v8f32(<8 x float> %x)
+  ret <8 x float> %a
+}
+
+define <4 x double> @round_v4f64(<4 x double> %x) {
+; SSE2-LABEL: round_v4f64:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $56, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 64
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: addq $56, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: round_v4f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT: movapd %xmm0, %xmm3
+; SSE41-NEXT: andpd %xmm2, %xmm3
+; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; SSE41-NEXT: orpd %xmm4, %xmm3
+; SSE41-NEXT: addpd %xmm0, %xmm3
+; SSE41-NEXT: roundpd $11, %xmm3, %xmm0
+; SSE41-NEXT: andpd %xmm1, %xmm2
+; SSE41-NEXT: orpd %xmm4, %xmm2
+; SSE41-NEXT: addpd %xmm1, %xmm2
+; SSE41-NEXT: roundpd $11, %xmm2, %xmm1
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: round_v4f64:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vandpd {{.*}}(%rip), %ymm0, %ymm1
+; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: round_v4f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX512-NEXT: vandpd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX512-NEXT: vorpd %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX512-NEXT: retq
+  %a = call <4 x double> @llvm.round.v4f64(<4 x double> %x)
+  ret <4 x double> %a
+}
+
+define <16 x float> @round_v16f32(<16 x float> %x) {
+; SSE2-LABEL: round_v16f32:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $104, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 112
+; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm1 = xmm1[0],mem[0]
+; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: callq _roundf
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Folded Reload
+; SSE2-NEXT: ## xmm3 = xmm3[0],mem[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE2-NEXT: addq $104, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: round_v16f32:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE41-NEXT: movaps %xmm0, %xmm5
+; SSE41-NEXT: andps %xmm4, %xmm5
+; SSE41-NEXT: movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; SSE41-NEXT: orps %xmm6, %xmm5
+; SSE41-NEXT: addps %xmm0, %xmm5
+; SSE41-NEXT: roundps $11, %xmm5, %xmm0
+; SSE41-NEXT: movaps %xmm1, %xmm5
+; SSE41-NEXT: andps %xmm4, %xmm5
+; SSE41-NEXT: orps %xmm6, %xmm5
+; SSE41-NEXT: addps %xmm1, %xmm5
+; SSE41-NEXT: roundps $11, %xmm5, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm5
+; SSE41-NEXT: andps %xmm4, %xmm5
+; SSE41-NEXT: orps %xmm6, %xmm5
+; SSE41-NEXT: addps %xmm2, %xmm5
+; SSE41-NEXT: roundps $11, %xmm5, %xmm2
+; SSE41-NEXT: andps %xmm3, %xmm4
+; SSE41-NEXT: orps %xmm6, %xmm4
+; SSE41-NEXT: addps %xmm3, %xmm4
+; SSE41-NEXT: roundps $11, %xmm4, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: round_v16f32:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3
+; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vorps %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: round_v16f32:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
+; AVX512-NEXT: vpord {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+  %a = call <16 x float> @llvm.round.v16f32(<16 x float> %x)
+  ret <16 x float> %a
+}
+
+define <8 x double> @round_v8f64(<8 x double> %x) {
+; SSE2-LABEL: round_v8f64:
+; SSE2: ## %bb.0:
+; SSE2-NEXT: subq $88, %rsp
+; SSE2-NEXT: .cfi_def_cfa_offset 96
+; SSE2-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, (%rsp) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: callq _round
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 ## 16-byte Reload
+; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; SSE2-NEXT: movaps (%rsp), %xmm2 ## 16-byte Reload
+; SSE2-NEXT: addq $88, %rsp
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: round_v8f64:
+; SSE41: ## %bb.0:
+; SSE41-NEXT: movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
+; SSE41-NEXT: movapd %xmm0, %xmm5
+; SSE41-NEXT: andpd %xmm4, %xmm5
+; SSE41-NEXT: movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; SSE41-NEXT: orpd %xmm6, %xmm5
+; SSE41-NEXT: addpd %xmm0, %xmm5
+; SSE41-NEXT: roundpd $11, %xmm5, %xmm0
+; SSE41-NEXT: movapd %xmm1, %xmm5
+; SSE41-NEXT: andpd %xmm4, %xmm5
+; SSE41-NEXT: orpd %xmm6, %xmm5
+; SSE41-NEXT: addpd %xmm1, %xmm5
+; SSE41-NEXT: roundpd $11, %xmm5, %xmm1
+; SSE41-NEXT: movapd %xmm2, %xmm5
+; SSE41-NEXT: andpd %xmm4, %xmm5
+; SSE41-NEXT: orpd %xmm6, %xmm5
+; SSE41-NEXT: addpd %xmm2, %xmm5
+; SSE41-NEXT: roundpd $11, %xmm5, %xmm2
+; SSE41-NEXT: andpd %xmm3, %xmm4
+; SSE41-NEXT: orpd %xmm6, %xmm4
+; SSE41-NEXT: addpd %xmm3, %xmm4
+; SSE41-NEXT: roundpd $11, %xmm4, %xmm3
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: round_v8f64:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3
+; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX1-NEXT: vorpd %ymm3, %ymm4, %ymm3
+; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vorpd %ymm2, %ymm4, %ymm2
+; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: round_v8f64:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
+; AVX512-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+  %a = call <8 x double> @llvm.round.v8f64(<8 x double> %x)
+  ret <8 x double> %a
+}
+
+declare float @llvm.round.f32(float)
+declare double @llvm.round.f64(double)
+declare <4 x float> @llvm.round.v4f32(<4 x float>)
+declare <2 x double> @llvm.round.v2f64(<2 x double>)
+declare <8 x float> @llvm.round.v8f32(<8 x float>)
+declare <4 x double> @llvm.round.v4f64(<4 x double>)
+declare <16 x float> @llvm.round.v16f32(<16 x float>)
+declare <8 x double> @llvm.round.v8f64(<8 x double>)
diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll
--- a/llvm/test/CodeGen/X86/vec-libcalls.ll
+++ b/llvm/test/CodeGen/X86/vec-libcalls.ll
@@ -386,16 +386,10 @@
 define <2 x float> @round_v2f32(<2 x float> %x) nounwind {
 ; CHECK-LABEL: round_v2f32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: subq $40, %rsp
-; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: callq roundf
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: # xmm0 = mem[1,1,3,3]
-; CHECK-NEXT: callq roundf
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm1
+; CHECK-NEXT: vorps {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vroundps $11, %xmm0, %xmm0
 ; CHECK-NEXT: retq
   %r = call <2 x float> @llvm.round.v2f32(<2 x float> %x)
   ret <2 x float> %r
diff --git a/llvm/test/CodeGen/X86/vec_round.ll b/llvm/test/CodeGen/X86/vec_round.ll
--- a/llvm/test/CodeGen/X86/vec_round.ll
+++ b/llvm/test/CodeGen/X86/vec_round.ll
@@ -11,8 +11,7 @@
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: callq round
-; CHECK-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN]
 ; CHECK-NEXT: callq use
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
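
Note (not part of the patch): the standalone C++ sketch below illustrates the identity the new LowerFROUND lowering relies on, round(x) == trunc(x + copysign(nextafter(0.5, 0.0), x)), and why the naive trunc(x + copysign(0.5, x)) is not equivalent. The helper names roundViaTrunc and naiveRound are illustrative only and do not appear in LLVM.

// Sketch: compare the patch's trunc-based rounding identity with libm round().
#include <cmath>
#include <cstdio>

static float roundViaTrunc(float x) {
  // Predecessor of 0.5, i.e. the largest float strictly below 0.5,
  // mirroring APFloat(0.5).next(/*nextDown*/true) in LowerFROUND.
  const float Point5Pred = std::nextafterf(0.5f, 0.0f);
  return std::truncf(x + std::copysignf(Point5Pred, x));
}

static float naiveRound(float x) {
  // The tempting-but-wrong variant that adds exactly 0.5.
  return std::truncf(x + std::copysignf(0.5f, x));
}

int main() {
  const float Tests[] = {2.5f, -2.5f, 1.4f, -1.4f,
                         std::nextafterf(0.5f, 0.0f)}; // 0.49999997
  for (float X : Tests)
    std::printf("x=%+.8f  round=%+.1f  viaTrunc=%+.1f  naive=%+.1f\n",
                X, std::roundf(X), roundViaTrunc(X), naiveRound(X));
  // For x = 0.49999997f the naive version's sum 0.49999997 + 0.5 rounds up to
  // 1.0 under ties-to-even, so trunc yields 1 instead of 0; adding the
  // predecessor of 0.5 keeps the sum below 1.0 and trunc yields the expected 0.
  return 0;
}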