Index: docs/LangRef.rst =================================================================== --- docs/LangRef.rst +++ docs/LangRef.rst @@ -14485,6 +14485,78 @@ mode argument is only intended as information to the compiler. +'``llvm.experimental.constrained.ceil``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.ceil( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.ceil``' intrinsic returns the ceiling of the +first operand. + +Arguments: +"""""""""" + +The first argument and the return value are floating-point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. The rounding mode is currently unused for this +intrinsic. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``ceil`` functions +would and handles error conditions in the same way. + + +'``llvm.experimental.constrained.floor``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.floor( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.floor``' intrinsic returns the floor of the +first operand. + +Arguments: +"""""""""" + +The first argument and the return value are floating-point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. The rounding mode is currently unused for this +intrinsic. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``floor`` functions +would and handles error conditions in the same way. + + General Intrinsics ------------------ Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -280,7 +280,7 @@ /// They are used to limit optimizations while the DAG is being optimized. STRICT_FSQRT, STRICT_FPOW, STRICT_FPOWI, STRICT_FSIN, STRICT_FCOS, STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2, - STRICT_FRINT, STRICT_FNEARBYINT, + STRICT_FRINT, STRICT_FNEARBYINT, STRICT_FCEIL, STRICT_FFLOOR, /// FMA - Perform a * b + c with no intermediate rounding step. FMA, Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -672,6 +672,8 @@ case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: + case ISD::STRICT_FCEIL: + case ISD::STRICT_FFLOOR: return true; } } Index: include/llvm/CodeGen/TargetLowering.h =================================================================== --- include/llvm/CodeGen/TargetLowering.h +++ include/llvm/CodeGen/TargetLowering.h @@ -819,6 +819,8 @@ case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break; case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break; case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break; + case ISD::STRICT_FCEIL: EqOpc = ISD::FCEIL; break; + case ISD::STRICT_FFLOOR: EqOpc = ISD::FFLOOR; break; } auto Action = getOperationAction(EqOpc, VT); Index: include/llvm/IR/IntrinsicInst.h =================================================================== --- include/llvm/IR/IntrinsicInst.h +++ include/llvm/IR/IntrinsicInst.h @@ -251,6 +251,8 @@ case Intrinsic::experimental_constrained_log2: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: + case Intrinsic::experimental_constrained_ceil: + case Intrinsic::experimental_constrained_floor: return true; default: return false; } Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -565,9 +565,17 @@ [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_ceil : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_floor : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; } // FIXME: Add intrinsics for fcmp, fptrunc, fpext, fptoui and fptosi. -// FIXME: Add intrinsics for fabs, copysign, floor, ceil, trunc and round? +// FIXME: Add intrinsics for fabs, copysign, trunc and round? //===------------------------- Expect Intrinsics --------------------------===// Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1108,6 +1108,8 @@ case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: + case ISD::STRICT_FCEIL: + case ISD::STRICT_FFLOOR: // These pseudo-ops get legalized as if they were their non-strict // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT // is also legal, but if ISD::FSQRT requires expansion then so does @@ -4137,11 +4139,13 @@ RTLIB::TRUNC_PPCF128)); break; case ISD::FFLOOR: + case ISD::STRICT_FFLOOR: Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, RTLIB::FLOOR_PPCF128)); break; case ISD::FCEIL: + case ISD::STRICT_FCEIL: Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64, RTLIB::CEIL_F80, RTLIB::CEIL_F128, RTLIB::CEIL_PPCF128)); Index: lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -303,6 +303,8 @@ case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: + case ISD::STRICT_FCEIL: + case ISD::STRICT_FFLOOR: // These pseudo-ops get legalized as if they were their non-strict // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT // is also legal, but if ISD::FSQRT requires expansion then so does @@ -739,6 +741,8 @@ case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: + case ISD::STRICT_FCEIL: + case ISD::STRICT_FFLOOR: return ExpandStrictFPOp(Op); default: return DAG.UnrollVectorOp(Op.getNode()); Index: lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -159,6 +159,8 @@ case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: + case ISD::STRICT_FCEIL: + case ISD::STRICT_FFLOOR: R = ScalarizeVecRes_StrictFPOp(N); break; } @@ -826,6 +828,8 @@ case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: + case ISD::STRICT_FCEIL: + case ISD::STRICT_FFLOOR: SplitVecRes_StrictFPOp(N, Lo, Hi); break; } @@ -2392,6 +2396,8 @@ case ISD::STRICT_FLOG2: case ISD::STRICT_FRINT: case ISD::STRICT_FNEARBYINT: + case ISD::STRICT_FCEIL: + case ISD::STRICT_FFLOOR: Res = WidenVecRes_StrictFP(N); break; Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7311,6 +7311,8 @@ NewOpc = ISD::FNEARBYINT; IsUnary = true; break; + case ISD::STRICT_FCEIL: NewOpc = ISD::FCEIL; IsUnary = true; break; + case ISD::STRICT_FFLOOR: NewOpc = ISD::FFLOOR; IsUnary = true; break; } // We're taking this node out of the chain, so we need to re-link things. Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5627,6 +5627,8 @@ case Intrinsic::experimental_constrained_log2: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: + case Intrinsic::experimental_constrained_ceil: + case Intrinsic::experimental_constrained_floor: visitConstrainedFPIntrinsic(cast(I)); return nullptr; case Intrinsic::fmuladd: { @@ -6356,6 +6358,12 @@ case Intrinsic::experimental_constrained_nearbyint: Opcode = ISD::STRICT_FNEARBYINT; break; + case Intrinsic::experimental_constrained_ceil: + Opcode = ISD::STRICT_FCEIL; + break; + case Intrinsic::experimental_constrained_floor: + Opcode = ISD::STRICT_FFLOOR; + break; } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = getRoot(); Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -189,7 +189,9 @@ case ISD::FSINCOS: return "fsincos"; case ISD::FTRUNC: return "ftrunc"; case ISD::FFLOOR: return "ffloor"; + case ISD::STRICT_FFLOOR: return "strict_ffloor"; case ISD::FCEIL: return "fceil"; + case ISD::STRICT_FCEIL: return "strict_fceil"; case ISD::FRINT: return "frint"; case ISD::STRICT_FRINT: return "strict_frint"; case ISD::FNEARBYINT: return "fnearbyint"; Index: lib/IR/IntrinsicInst.cpp =================================================================== --- lib/IR/IntrinsicInst.cpp +++ lib/IR/IntrinsicInst.cpp @@ -152,6 +152,8 @@ case Intrinsic::experimental_constrained_log2: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: + case Intrinsic::experimental_constrained_ceil: + case Intrinsic::experimental_constrained_floor: return true; } } Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -4104,6 +4104,8 @@ case Intrinsic::experimental_constrained_log2: case Intrinsic::experimental_constrained_rint: case Intrinsic::experimental_constrained_nearbyint: + case Intrinsic::experimental_constrained_ceil: + case Intrinsic::experimental_constrained_floor: visitConstrainedFPIntrinsic( cast(*CS.getInstruction())); break; Index: test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -3668,6 +3668,266 @@ ret <4 x double> %nearby } +define <1 x float> @constrained_vector_ceil_v1f32() { +; NO-FMA-LABEL: constrained_vector_ceil_v1f32: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: pushq %rax +; NO-FMA-NEXT: .cfi_def_cfa_offset 16 +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq ceilf +; NO-FMA-NEXT: popq %rax +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_ceil_v1f32: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; HAS-FMA-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 +; HAS-FMA-NEXT: retq +entry: + %ceil = call <1 x float> @llvm.experimental.constrained.ceil.v1f32( + <1 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <1 x float> %ceil +} + +define <2 x double> @constrained_vector_ceil_v2f64() { +; NO-FMA-LABEL: constrained_vector_ceil_v2f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq ceil +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq ceil +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_ceil_v2f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vroundpd $10, {{.*}}(%rip), %xmm0 +; HAS-FMA-NEXT: retq +entry: + %ceil = call <2 x double> @llvm.experimental.constrained.ceil.v2f64( + <2 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <2 x double> %ceil +} + +define <3 x float> @constrained_vector_ceil_v3f32() { +; NO-FMA-LABEL: constrained_vector_ceil_v3f32: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq ceilf +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq ceilf +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq ceilf +; NO-FMA-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps %xmm1, %xmm0 +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_ceil_v3f32: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; HAS-FMA-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 +; HAS-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HAS-FMA-NEXT: vroundss $10, %xmm1, %xmm1, %xmm1 +; HAS-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; HAS-FMA-NEXT: vroundss $10, %xmm2, %xmm2, %xmm2 +; HAS-FMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; HAS-FMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; HAS-FMA-NEXT: retq +entry: + %ceil = call <3 x float> @llvm.experimental.constrained.ceil.v3f32( + <3 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <3 x float> %ceil +} + +define <3 x double> @constrained_vector_ceil_v3f64() {xentry: +; NO-FMA-LABEL: constrained_vector_ceil_v3f64: +; NO-FMA: # %bb.0: # %xentry +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq ceil +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq ceil +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq ceil +; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) +; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_ceil_v3f64: +; HAS-FMA: # %bb.0: # %xentry +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0 +; HAS-FMA-NEXT: vroundpd $10, {{.*}}(%rip), %xmm1 +; HAS-FMA-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; HAS-FMA-NEXT: retq + %ceil = call <3 x double> @llvm.experimental.constrained.ceil.v3f64( + <3 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <3 x double> %ceil +} + +define <1 x float> @constrained_vector_floor_v1f32() { +; NO-FMA-LABEL: constrained_vector_floor_v1f32: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: pushq %rax +; NO-FMA-NEXT: .cfi_def_cfa_offset 16 +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq floorf +; NO-FMA-NEXT: popq %rax +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_floor_v1f32: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; HAS-FMA-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 +; HAS-FMA-NEXT: retq +entry: + %floor = call <1 x float> @llvm.experimental.constrained.floor.v1f32( + <1 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <1 x float> %floor +} + +define <2 x double> @constrained_vector_floor_v2f64() { +; NO-FMA-LABEL: constrained_vector_floor_v2f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq floor +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq floor +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_floor_v2f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vroundpd $9, {{.*}}(%rip), %xmm0 +; HAS-FMA-NEXT: retq +entry: + %floor = call <2 x double> @llvm.experimental.constrained.floor.v2f64( + <2 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <2 x double> %floor +} + +define <3 x float> @constrained_vector_floor_v3f32() { +; NO-FMA-LABEL: constrained_vector_floor_v3f32: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq floorf +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq floorf +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq floorf +; NO-FMA-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps %xmm1, %xmm0 +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_floor_v3f32: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; HAS-FMA-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 +; HAS-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; HAS-FMA-NEXT: vroundss $9, %xmm1, %xmm1, %xmm1 +; HAS-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; HAS-FMA-NEXT: vroundss $9, %xmm2, %xmm2, %xmm2 +; HAS-FMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; HAS-FMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; HAS-FMA-NEXT: retq +entry: + %floor = call <3 x float> @llvm.experimental.constrained.floor.v3f32( + <3 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <3 x float> %floor +} + +define <3 x double> @constrained_vector_floor_v3f64() {xentry: +; NO-FMA-LABEL: constrained_vector_floor_v3f64: +; NO-FMA: # %bb.0: # %xentry +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq floor +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq floor +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq floor +; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) +; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_floor_v3f64: +; HAS-FMA: # %bb.0: # %xentry +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0 +; HAS-FMA-NEXT: vroundpd $9, {{.*}}(%rip), %xmm1 +; HAS-FMA-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; HAS-FMA-NEXT: retq + %floor = call <3 x double> @llvm.experimental.constrained.floor.v3f64( + <3 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <3 x double> %floor +} + ; Single width declarations declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double>, <2 x double>, metadata, metadata) @@ -3688,6 +3948,8 @@ declare <2 x double> @llvm.experimental.constrained.log2.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, metadata, metadata) +declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata, metadata) ; Scalar width declarations declare <1 x float> @llvm.experimental.constrained.fadd.v1f32(<1 x float>, <1 x float>, metadata, metadata) @@ -3708,6 +3970,8 @@ declare <1 x float> @llvm.experimental.constrained.log2.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.rint.v1f32(<1 x float>, metadata, metadata) declare <1 x float> @llvm.experimental.constrained.nearbyint.v1f32(<1 x float>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.ceil.v1f32(<1 x float>, metadata, metadata) +declare <1 x float> @llvm.experimental.constrained.floor.v1f32(<1 x float>, metadata, metadata) ; Illegal width declarations declare <3 x float> @llvm.experimental.constrained.fadd.v3f32(<3 x float>, <3 x float>, metadata, metadata) @@ -3746,6 +4010,10 @@ declare <3 x double> @llvm.experimental.constrained.rint.v3f64(<3 x double>, metadata, metadata) declare <3 x float> @llvm.experimental.constrained.nearbyint.v3f32(<3 x float>, metadata, metadata) declare <3 x double> @llvm.experimental.constrained.nearbyint.v3f64(<3 x double>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.ceil.v3f32(<3 x float>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.ceil.v3f64(<3 x double>, metadata, metadata) +declare <3 x float> @llvm.experimental.constrained.floor.v3f32(<3 x float>, metadata, metadata) +declare <3 x double> @llvm.experimental.constrained.floor.v3f64(<3 x double>, metadata, metadata) ; Double width declarations declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata) @@ -3767,3 +4035,5 @@ declare <4 x double> @llvm.experimental.constrained.log2.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.rint.v4f64(<4 x double>, metadata, metadata) declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata, metadata)