Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -701,6 +701,7 @@
   void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
Index: llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -755,6 +755,25 @@
   case ISD::FMA:
     SplitVecRes_TernaryOp(N, Lo, Hi);
     break;
+  case ISD::STRICT_FADD:
+  case ISD::STRICT_FSUB:
+  case ISD::STRICT_FMUL:
+  case ISD::STRICT_FDIV:
+  case ISD::STRICT_FSQRT:
+  case ISD::STRICT_FMA:
+  case ISD::STRICT_FPOW:
+  case ISD::STRICT_FPOWI:
+  case ISD::STRICT_FSIN:
+  case ISD::STRICT_FCOS:
+  case ISD::STRICT_FEXP:
+  case ISD::STRICT_FEXP2:
+  case ISD::STRICT_FLOG:
+  case ISD::STRICT_FLOG10:
+  case ISD::STRICT_FLOG2:
+  case ISD::STRICT_FRINT:
+  case ISD::STRICT_FNEARBYINT:
+    SplitVecRes_StrictFPOp(N, Lo, Hi);
+    break;
   }

   // If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -1034,6 +1053,56 @@
   Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
 }

+void DAGTypeLegalizer::SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo,
+                                              SDValue &Hi) {
+  unsigned NumOps = N->getNumOperands();
+  SDValue Chain = N->getOperand(0);
+  EVT LoVT, HiVT;
+  SDLoc dl(N);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+  SmallVector<SDValue, 4> OpsLo;
+  SmallVector<SDValue, 4> OpsHi;
+
+  // The Chain is the first operand.
+  OpsLo.push_back(Chain);
+  OpsHi.push_back(Chain);
+
+  // Now process the remaining operands.
+  for (unsigned i = 1; i < NumOps; ++i) {
+    SDValue Op = N->getOperand(i);
+    SDValue OpLo = Op;
+    SDValue OpHi = Op;
+
+    EVT InVT = Op.getValueType();
+    if (InVT.isVector()) {
+      // If the input also splits, handle it directly for a
+      // compile-time speedup. Otherwise split it by hand.
+      if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
+        GetSplitVector(Op, OpLo, OpHi);
+      else
+        std::tie(OpLo, OpHi) = DAG.SplitVectorOperand(N, i);
+    }
+
+    OpsLo.push_back(OpLo);
+    OpsHi.push_back(OpHi);
+  }
+
+  EVT LoValueVTs[] = {LoVT, MVT::Other};
+  EVT HiValueVTs[] = {HiVT, MVT::Other};
+  Lo = DAG.getNode(N->getOpcode(), dl, LoValueVTs, OpsLo);
+  Hi = DAG.getNode(N->getOpcode(), dl, HiValueVTs, OpsHi);
+
+  // Build a factor node to remember that this Op is independent of the
+  // other one.
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                      Lo.getValue(1), Hi.getValue(1));
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain); +} + void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue Vec = N->getOperand(0); Index: llvm/trunk/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -2,14 +2,14 @@ ; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck --check-prefix=COMMON --check-prefix=NO-FMA --check-prefix=FMACALL64 --check-prefix=FMACALL32 %s ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck -check-prefix=COMMON --check-prefix=HAS-FMA --check-prefix=FMA64 --check-prefix=FMA32 %s -define <2 x double> @constrained_vector_fdiv() { -; NO-FMA-LABEL: constrained_vector_fdiv: +define <2 x double> @constrained_vector_fdiv_v2f64() { +; NO-FMA-LABEL: constrained_vector_fdiv_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: movapd {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00] ; NO-FMA-NEXT: divpd {{.*}}(%rip), %xmm0 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_fdiv: +; HAS-FMA-LABEL: constrained_vector_fdiv_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: vmovapd {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00] ; HAS-FMA-NEXT: vdivpd {{.*}}(%rip), %xmm0, %xmm0 @@ -23,14 +23,39 @@ ret <2 x double> %div } -define <2 x double> @constrained_vector_fmul(<2 x double> %a) { -; NO-FMA-LABEL: constrained_vector_fmul: +define <4 x double> @constrained_vector_fdiv_v4f64() { +; NO-FMA-LABEL: constrained_vector_fdiv_v4f64: +; NO-FMA: # %bb.0: +; NO-FMA-NEXT: movapd {{.*#+}} xmm2 = [1.000000e+01,1.000000e+01] +; NO-FMA-NEXT: movapd {{.*#+}} xmm1 = [3.000000e+00,4.000000e+00] +; NO-FMA-NEXT: divpd %xmm2, %xmm1 +; NO-FMA-NEXT: movapd {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00] +; NO-FMA-NEXT: divpd %xmm2, %xmm0 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_fdiv_v4f64: +; HAS-FMA: # %bb.0: +; HAS-FMA-NEXT: vmovapd {{.*#+}} ymm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00] +; HAS-FMA-NEXT: vdivpd {{.*}}(%rip), %ymm0, %ymm0 +; HAS-FMA-NEXT: retq + %div = call <4 x double> @llvm.experimental.constrained.fdiv.v4f64( + <4 x double> , + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %div +} + +define <2 x double> @constrained_vector_fmul_v2f64() { +; NO-FMA-LABEL: constrained_vector_fmul_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: movapd {{.*#+}} xmm0 = [1.797693e+308,1.797693e+308] ; NO-FMA-NEXT: mulpd {{.*}}(%rip), %xmm0 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_fmul: +; HAS-FMA-LABEL: constrained_vector_fmul_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: vmovapd {{.*#+}} xmm0 = [1.797693e+308,1.797693e+308] ; HAS-FMA-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 @@ -44,14 +69,40 @@ ret <2 x double> %mul } -define <2 x double> @constrained_vector_fadd() { -; NO-FMA-LABEL: constrained_vector_fadd: +define <4 x double> @constrained_vector_fmul_v4f64() { +; NO-FMA-LABEL: constrained_vector_fmul_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: movapd {{.*#+}} xmm0 = [1.797693e+308,1.797693e+308] +; NO-FMA-NEXT: movapd {{.*#+}} xmm1 = [4.000000e+00,5.000000e+00] +; NO-FMA-NEXT: mulpd %xmm0, %xmm1 +; NO-FMA-NEXT: mulpd {{.*}}(%rip), %xmm0 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_fmul_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovapd {{.*#+}} ymm0 = 
[1.797693e+308,1.797693e+308,1.797693e+308,1.797693e+308] +; HAS-FMA-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 +; HAS-FMA-NEXT: retq +entry: + %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64( + <4 x double> , + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %mul +} + + +define <2 x double> @constrained_vector_fadd_v2f64() { +; NO-FMA-LABEL: constrained_vector_fadd_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: movapd {{.*#+}} xmm0 = [1.797693e+308,1.797693e+308] ; NO-FMA-NEXT: addpd {{.*}}(%rip), %xmm0 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_fadd: +; HAS-FMA-LABEL: constrained_vector_fadd_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: vmovapd {{.*#+}} xmm0 = [1.797693e+308,1.797693e+308] ; HAS-FMA-NEXT: vaddpd {{.*}}(%rip), %xmm0, %xmm0 @@ -65,14 +116,39 @@ ret <2 x double> %add } -define <2 x double> @constrained_vector_fsub() { -; NO-FMA-LABEL: constrained_vector_fsub: +define <4 x double> @constrained_vector_fadd_v4f64() { +; NO-FMA-LABEL: constrained_vector_fadd_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: movapd {{.*#+}} xmm0 = [1.797693e+308,1.797693e+308] +; NO-FMA-NEXT: movapd {{.*#+}} xmm1 = [2.000000e+00,2.000000e-01] +; NO-FMA-NEXT: addpd %xmm0, %xmm1 +; NO-FMA-NEXT: addpd {{.*}}(%rip), %xmm0 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_fadd_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovapd {{.*#+}} ymm0 = [1.797693e+308,1.797693e+308,1.797693e+308,1.797693e+308] +; HAS-FMA-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0 +; HAS-FMA-NEXT: retq +entry: + %add = call <4 x double> @llvm.experimental.constrained.fadd.v4f64( + <4 x double> , + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %add +} + +define <2 x double> @constrained_vector_fsub_v2f64() { +; NO-FMA-LABEL: constrained_vector_fsub_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: movapd {{.*#+}} xmm0 = [-1.797693e+308,-1.797693e+308] ; NO-FMA-NEXT: subpd {{.*}}(%rip), %xmm0 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_fsub: +; HAS-FMA-LABEL: constrained_vector_fsub_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: vmovapd {{.*#+}} xmm0 = [-1.797693e+308,-1.797693e+308] ; HAS-FMA-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 @@ -86,6 +162,31 @@ ret <2 x double> %sub } +define <4 x double> @constrained_vector_fsub_v4f64() { +; NO-FMA-LABEL: constrained_vector_fsub_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: movapd {{.*#+}} xmm0 = [-1.797693e+308,-1.797693e+308] +; NO-FMA-NEXT: movapd %xmm0, %xmm1 +; NO-FMA-NEXT: subpd {{.*}}(%rip), %xmm1 +; NO-FMA-NEXT: subpd {{.*}}(%rip), %xmm0 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_fsub_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovapd {{.*#+}} ymm0 = [-1.797693e+308,-1.797693e+308,-1.797693e+308,-1.797693e+308] +; HAS-FMA-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 +; HAS-FMA-NEXT: retq +entry: + %sub = call <4 x double> @llvm.experimental.constrained.fsub.v4f64( + <4 x double> , + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %sub +} + define <2 x double> @constrained_vector_fma_v2f64() { ; NO-FMA-LABEL: constrained_vector_fma_v2f64: ; NO-FMA: # %bb.0: # %entry @@ -122,6 +223,56 @@ ret <2 x double> %fma } +define <4 x double> @constrained_vector_fma_v4f64() { +; NO-FMA-LABEL: constrained_vector_fma_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd 
{{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; NO-FMA-NEXT: callq fma +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; NO-FMA-NEXT: callq fma +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; NO-FMA-NEXT: callq fma +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; NO-FMA-NEXT: callq fma +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_fma_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovapd {{.*#+}} ymm1 = [3.500000e+00,2.500000e+00,1.500000e+00,5.000000e-01] +; HAS-FMA-NEXT: vmovapd {{.*#+}} ymm0 = [7.500000e+00,6.500000e+00,5.500000e+00,4.500000e+00] +; HAS-FMA-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + mem +; HAS-FMA-NEXT: retq +entry: + %fma = call <4 x double> @llvm.experimental.constrained.fma.v4f64( + <4 x double> , + <4 x double> , + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %fma +} + define <4 x float> @constrained_vector_fma_v4f32() { ; NO-FMA-LABEL: constrained_vector_fma_v4f32: ; NO-FMA: # %bb.0: # %entry @@ -172,13 +323,94 @@ ret <4 x float> %fma } -define <2 x double> @constrained_vector_sqrt() { -; NO-FMA-LABEL: constrained_vector_sqrt: +define <8 x float> @constrained_vector_fma_v8f32() { +; NO-FMA-LABEL: constrained_vector_fma_v8f32: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq fmaf +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq fmaf +; NO-FMA-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq fmaf +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq fmaf +; NO-FMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1] +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq fmaf +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq fmaf +; NO-FMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq fmaf +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NO-FMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; NO-FMA-NEXT: callq fmaf +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_fma_v8f32: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vmovaps {{.*#+}} ymm1 = [3.500000e+00,2.500000e+00,1.500000e+00,5.000000e-01,7.500000e+00,6.500000e+00,5.500000e+00,4.500000e+00] +; HAS-FMA-NEXT: vmovaps {{.*#+}} ymm0 = [7.500000e+00,6.500000e+00,5.500000e+00,4.500000e+00,1.150000e+01,1.050000e+01,9.500000e+00,8.500000e+00] +; HAS-FMA-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + mem +; HAS-FMA-NEXT: retq +entry: + %fma = call <8 x float> @llvm.experimental.constrained.fma.v8f32( + <8 x float> , + <8 x float> , + <8 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <8 x float> %fma +} + +define <2 x double> @constrained_vector_sqrt_v2f64() { +; NO-FMA-LABEL: constrained_vector_sqrt_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: sqrtpd {{.*}}(%rip), %xmm0 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_sqrt: +; HAS-FMA-LABEL: constrained_vector_sqrt_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: vsqrtpd {{.*}}(%rip), %xmm0 ; HAS-FMA-NEXT: retq @@ -190,8 +422,28 @@ ret <2 x double> %sqrt } -define <2 x double> @constrained_vector_pow() { -; NO-FMA-LABEL: constrained_vector_pow: +define <4 x double> @constrained_vector_sqrt_v4f64() { +; NO-FMA-LABEL: constrained_vector_sqrt_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: sqrtpd {{.*}}(%rip), %xmm1 +; NO-FMA-NEXT: sqrtpd {{.*}}(%rip), %xmm0 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_sqrt_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vsqrtpd {{.*}}(%rip), %ymm0 +; HAS-FMA-NEXT: retq +entry: + %sqrt = call <4 x double> @llvm.experimental.constrained.sqrt.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %sqrt +} + 
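; The v4f64/v8f32 tests above and below exercise the new SplitVecRes_StrictFPOp
; path: the wide vector type is not legal for the target, and the strict nodes
; carry a chain, so the type legalizer splits each operation into two
; half-width halves that share the incoming chain, then rejoins the two output
; chains with a TokenFactor. At the IR level the split is roughly equivalent to
; the hand-written sketch below (illustrative only: the function and value
; names are invented, fadd stands in for any of the opcodes handled, and the
; legalizer actually operates on SelectionDAG nodes, where the chain operands
; are explicit rather than implicit as in IR). Operations with no packed
; instruction (pow, sin, cos, exp, log, ...) are then further scalarized into
; libm calls per element, as the CHECK lines in the tests that follow show.
;
; define <4 x double> @split_sketch(<4 x double> %a, <4 x double> %b) {
;   %a.lo = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
;   %a.hi = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
;   %b.lo = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
;   %b.hi = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 2, i32 3>
;   %r.lo = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(
;               <2 x double> %a.lo, <2 x double> %b.lo,
;               metadata !"round.dynamic", metadata !"fpexcept.strict")
;   %r.hi = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(
;               <2 x double> %a.hi, <2 x double> %b.hi,
;               metadata !"round.dynamic", metadata !"fpexcept.strict")
;   %r = shufflevector <2 x double> %r.lo, <2 x double> %r.hi,
;                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   ret <4 x double> %r
; }
; declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)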
+define <2 x double> @constrained_vector_pow_v2f64() { +; NO-FMA-LABEL: constrained_vector_pow_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -208,7 +460,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_pow: +; HAS-FMA-LABEL: constrained_vector_pow_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: subq $24, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -233,8 +485,76 @@ ret <2 x double> %pow } -define <2 x double> @constrained_vector_powi() { -; NO-FMA-LABEL: constrained_vector_powi: +define <4 x double> @constrained_vector_pow_v4f64() { +; NO-FMA-LABEL: constrained_vector_pow_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NO-FMA-NEXT: callq pow +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NO-FMA-NEXT: callq pow +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NO-FMA-NEXT: callq pow +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; NO-FMA-NEXT: callq pow +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_pow_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; HAS-FMA-NEXT: callq pow +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; HAS-FMA-NEXT: callq pow +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; HAS-FMA-NEXT: callq pow +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; HAS-FMA-NEXT: callq pow +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + %pow = call <4 x double> @llvm.experimental.constrained.pow.v4f64( + <4 x double> , + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %pow +} + +define <2 x double> @constrained_vector_powi_v2f64() { +; NO-FMA-LABEL: constrained_vector_powi_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: 
.cfi_def_cfa_offset 32 @@ -251,7 +571,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_powi: +; HAS-FMA-LABEL: constrained_vector_powi_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: subq $24, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -276,86 +596,270 @@ ret <2 x double> %powi } -define <2 x double> @constrained_vector_sin() { -; NO-FMA-LABEL: constrained_vector_sin: +define <4 x double> @constrained_vector_powi_v4f64() { +; NO-FMA-LABEL: constrained_vector_powi_v4f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; NO-FMA-NEXT: callq sin +; NO-FMA-NEXT: movl $3, %edi +; NO-FMA-NEXT: callq __powidf2 ; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; NO-FMA-NEXT: callq sin +; NO-FMA-NEXT: movl $3, %edi +; NO-FMA-NEXT: callq __powidf2 ; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movl $3, %edi +; NO-FMA-NEXT: callq __powidf2 +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: movl $3, %edi +; NO-FMA-NEXT: callq __powidf2 +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_sin: +; HAS-FMA-LABEL: constrained_vector_powi_v4f64: ; HAS-FMA: # %bb.0: # %entry -; HAS-FMA-NEXT: subq $24, %rsp -; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 ; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; HAS-FMA-NEXT: callq sin +; HAS-FMA-NEXT: movl $3, %edi +; HAS-FMA-NEXT: callq __powidf2 ; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; HAS-FMA-NEXT: callq sin +; HAS-FMA-NEXT: movl $3, %edi +; HAS-FMA-NEXT: callq __powidf2 ; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; HAS-FMA-NEXT: addq $24, %rsp +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: movl $3, %edi +; HAS-FMA-NEXT: callq __powidf2 +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: movl $3, %edi +; HAS-FMA-NEXT: callq __powidf2 +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 ; HAS-FMA-NEXT: retq entry: - %sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64( - <2 x double> , - metadata !"round.dynamic", - metadata !"fpexcept.strict") - ret <2 x double> %sin + %powi = call <4 x double> @llvm.experimental.constrained.powi.v4f64( + <4 x double> , + i32 3, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret 
<4 x double> %powi } -define <2 x double> @constrained_vector_cos() { -; NO-FMA-LABEL: constrained_vector_cos: + +define <2 x double> @constrained_vector_sin_v2f64() { +; NO-FMA-LABEL: constrained_vector_sin_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; NO-FMA-NEXT: callq cos +; NO-FMA-NEXT: callq sin ; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; NO-FMA-NEXT: callq cos +; NO-FMA-NEXT: callq sin ; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] ; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_cos: +; HAS-FMA-LABEL: constrained_vector_sin_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: subq $24, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 ; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; HAS-FMA-NEXT: callq cos +; HAS-FMA-NEXT: callq sin ; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; HAS-FMA-NEXT: callq cos +; HAS-FMA-NEXT: callq sin ; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] ; HAS-FMA-NEXT: addq $24, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 ; HAS-FMA-NEXT: retq entry: - %cos = call <2 x double> @llvm.experimental.constrained.cos.v2f64( + %sin = call <2 x double> @llvm.experimental.constrained.sin.v2f64( <2 x double> , metadata !"round.dynamic", metadata !"fpexcept.strict") - ret <2 x double> %cos + ret <2 x double> %sin } -define <2 x double> @constrained_vector_exp() { -; NO-FMA-LABEL: constrained_vector_exp: +define <4 x double> @constrained_vector_sin_v4f64() { +; NO-FMA-LABEL: constrained_vector_sin_v4f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq sin +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq sin +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq sin +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq sin +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_sin_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq sin +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq sin +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq sin +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq sin +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + %sin = call <4 x double> @llvm.experimental.constrained.sin.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %sin +} + +define <2 x double> @constrained_vector_cos_v2f64() { +; NO-FMA-LABEL: constrained_vector_cos_v2f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq cos +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq cos +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_cos_v2f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $24, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq cos +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq cos +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: addq $24, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + %cos = call <2 x double> @llvm.experimental.constrained.cos.v2f64( + <2 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <2 x double> %cos +} + +define <4 x double> @constrained_vector_cos_v4f64() { +; NO-FMA-LABEL: constrained_vector_cos_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq cos +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq cos +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq cos +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq cos +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_cos_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq cos +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq cos +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; 
HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq cos +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq cos +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + %cos = call <4 x double> @llvm.experimental.constrained.cos.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %cos +} + +define <2 x double> @constrained_vector_exp_v2f64() { +; NO-FMA-LABEL: constrained_vector_exp_v2f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp @@ -368,7 +872,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_exp: +; HAS-FMA-LABEL: constrained_vector_exp_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: subq $24, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -390,8 +894,66 @@ ret <2 x double> %exp } -define <2 x double> @constrained_vector_exp2() { -; NO-FMA-LABEL: constrained_vector_exp2: +define <4 x double> @constrained_vector_exp_v4f64() { +; NO-FMA-LABEL: constrained_vector_exp_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq exp +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq exp +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq exp +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq exp +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_exp_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq exp +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq exp +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq exp +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq exp +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + 
%exp = call <4 x double> @llvm.experimental.constrained.exp.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %exp +} + +define <2 x double> @constrained_vector_exp2_v2f64() { +; NO-FMA-LABEL: constrained_vector_exp2_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -406,7 +968,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_exp2: +; HAS-FMA-LABEL: constrained_vector_exp2_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: subq $24, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -428,8 +990,66 @@ ret <2 x double> %exp2 } -define <2 x double> @constrained_vector_log() { -; NO-FMA-LABEL: constrained_vector_log: +define <4 x double> @constrained_vector_exp2_v4f64() { +; NO-FMA-LABEL: constrained_vector_exp2_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq exp2 +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq exp2 +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq exp2 +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq exp2 +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_exp2_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq exp2 +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq exp2 +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq exp2 +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq exp2 +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + %exp2 = call <4 x double> @llvm.experimental.constrained.exp2.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %exp2 +} + +define <2 x double> @constrained_vector_log_v2f64() { +; NO-FMA-LABEL: constrained_vector_log_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -444,7 +1064,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_log: +; HAS-FMA-LABEL: constrained_vector_log_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: subq $24, 
%rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -466,8 +1086,66 @@ ret <2 x double> %log } -define <2 x double> @constrained_vector_log10() { -; NO-FMA-LABEL: constrained_vector_log10: +define <4 x double> @constrained_vector_log_v4f64() { +; NO-FMA-LABEL: constrained_vector_log_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_log_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + %log = call <4 x double> @llvm.experimental.constrained.log.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %log +} + +define <2 x double> @constrained_vector_log10_v2f64() { +; NO-FMA-LABEL: constrained_vector_log10_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -482,7 +1160,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_log10: +; HAS-FMA-LABEL: constrained_vector_log10_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: subq $24, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -504,8 +1182,66 @@ ret <2 x double> %log10 } -define <2 x double> @constrained_vector_log2() { -; NO-FMA-LABEL: constrained_vector_log2: +define <4 x double> @constrained_vector_log10_v4f64() { +; NO-FMA-LABEL: constrained_vector_log10_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log10 +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log10 +; 
NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log10 +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log10 +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_log10_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log10 +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log10 +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log10 +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log10 +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + %log10 = call <4 x double> @llvm.experimental.constrained.log10.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %log10 +} + +define <2 x double> @constrained_vector_log2_v2f64() { +; NO-FMA-LABEL: constrained_vector_log2_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -520,7 +1256,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_log2: +; HAS-FMA-LABEL: constrained_vector_log2_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: subq $24, %rsp ; HAS-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -542,8 +1278,66 @@ ret <2 x double> %log2 } -define <2 x double> @constrained_vector_rint() { -; NO-FMA-LABEL: constrained_vector_rint: +define <4 x double> @constrained_vector_log2_v4f64() { +; NO-FMA-LABEL: constrained_vector_log2_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log2 +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log2 +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log2 +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq log2 +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte 
Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_log2_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: subq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 48 +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log2 +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log2 +; HAS-FMA-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log2 +; HAS-FMA-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; HAS-FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; HAS-FMA-NEXT: callq log2 +; HAS-FMA-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; HAS-FMA-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload +; HAS-FMA-NEXT: addq $40, %rsp +; HAS-FMA-NEXT: .cfi_def_cfa_offset 8 +; HAS-FMA-NEXT: retq +entry: + %log2 = call <4 x double> @llvm.experimental.constrained.log2.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %log2 +} + +define <2 x double> @constrained_vector_rint_v2f64() { +; NO-FMA-LABEL: constrained_vector_rint_v2f64: ; NO-FMA: # %bb.0: # %entry ; NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -558,7 +1352,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_rint: +; HAS-FMA-LABEL: constrained_vector_rint_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: vroundpd $4, {{.*}}(%rip), %xmm0 ; HAS-FMA-NEXT: retq @@ -570,8 +1364,47 @@ ret <2 x double> %rint } -define <2 x double> @constrained_vector_nearbyint() { -; NO-FMA-LABEL: constrained_vector_nearbyint: +define <4 x double> @constrained_vector_rint_v4f64() { +; NO-FMA-LABEL: constrained_vector_rint_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq rint +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq rint +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq rint +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq rint +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_rint_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vroundpd $4, {{.*}}(%rip), %ymm0 +; HAS-FMA-NEXT: retq +entry: + %rint = call <4 x double> @llvm.experimental.constrained.rint.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %rint +} + +define <2 x double> @constrained_vector_nearbyint_v2f64() { +; NO-FMA-LABEL: constrained_vector_nearbyint_v2f64: ; NO-FMA: # %bb.0: # %entry ; 
NO-FMA-NEXT: subq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 32 @@ -586,7 +1419,7 @@ ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; -; HAS-FMA-LABEL: constrained_vector_nearbyint: +; HAS-FMA-LABEL: constrained_vector_nearbyint_v2f64: ; HAS-FMA: # %bb.0: # %entry ; HAS-FMA-NEXT: vroundpd $12, {{.*}}(%rip), %xmm0 ; HAS-FMA-NEXT: retq @@ -598,7 +1431,46 @@ ret <2 x double> %nearby } +define <4 x double> @constrained_vector_nearbyint_v4f64() { +; NO-FMA-LABEL: constrained_vector_nearbyint_v4f64: +; NO-FMA: # %bb.0: # %entry +; NO-FMA-NEXT: subq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 48 +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq nearbyint +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq nearbyint +; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] +; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq nearbyint +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; NO-FMA-NEXT: callq nearbyint +; NO-FMA-NEXT: movaps %xmm0, %xmm1 +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; NO-FMA-NEXT: # xmm1 = xmm1[0],mem[0] +; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: addq $40, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 8 +; NO-FMA-NEXT: retq +; +; HAS-FMA-LABEL: constrained_vector_nearbyint_v4f64: +; HAS-FMA: # %bb.0: # %entry +; HAS-FMA-NEXT: vroundpd $12, {{.*}}(%rip), %ymm0 +; HAS-FMA-NEXT: retq +entry: + %nearby = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64( + <4 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret <4 x double> %nearby +} +; Single width declarations declare <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double>, <2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double>, <2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata) @@ -617,3 +1489,23 @@ declare <2 x double> @llvm.experimental.constrained.log2.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.rint.v2f64(<2 x double>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>, metadata, metadata) + +; Double width declarations +declare <4 x double> @llvm.experimental.constrained.fdiv.v4f64(<4 x double>, <4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.fmul.v4f64(<4 x double>, <4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.fadd.v4f64(<4 x double>, <4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.fsub.v4f64(<4 x double>, <4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double>, <4 x double>, <4 x double>, metadata, metadata) +declare <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float>, <8 x float>, <8 x float>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sqrt.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x 
double>, i32, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.log10.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.log2.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.rint.v4f64(<4 x double>, metadata, metadata) +declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double>, metadata, metadata)
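
; Note: CHECK lines in this style are typically autogenerated with LLVM's
; utils/update_llc_test_checks.py script. After a codegen change such as this
; one, they can be regenerated by pointing the script at the test with a
; freshly built llc, e.g. (exact option spelling depends on the script
; revision):
;
;   utils/update_llc_test_checks.py --llc-binary=<path-to-llc> \
;     test/CodeGen/X86/vector-constrained-fp-intrinsics.ll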