Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3953,6 +3953,16 @@ VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts); } + // If the widened op is not legal and the scalar variants + // will be turned into libcalls, unroll the op to prevent + // widening so that we don't end up with libcalls on + // undef elements. + if (!TLI.isOperationLegal(Opcode, VT) && + TLI.getOperationAction(Opcode, VT.getScalarType()) == + TargetLowering::LibCall) { + return DAG.UnrollVectorOp(N, VT.getVectorNumElements()); + } + if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) { // Operation doesn't trap so just widen as normal. SDValue InOp1 = GetWidenedVector(N->getOperand(0)); Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -354,8 +354,8 @@ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); - setOperationAction(ISD::FREM , MVT::f32 , Expand); - setOperationAction(ISD::FREM , MVT::f64 , Expand); + setOperationAction(ISD::FREM , MVT::f32 , LibCall); + setOperationAction(ISD::FREM , MVT::f64 , LibCall); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FREM , MVT::f128 , Expand); @@ -2195,19 +2195,24 @@ // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && - (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) - for (ISD::NodeType Op : + (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) { + for (ISD::NodeType Op : {ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FEXP, ISD::STRICT_FEXP, ISD::FFLOOR, ISD::STRICT_FFLOOR, - ISD::FREM, ISD::STRICT_FREM, ISD::FLOG, ISD::STRICT_FLOG, ISD::FLOG10, ISD::STRICT_FLOG10, ISD::FPOW, ISD::STRICT_FPOW, ISD::FSIN, ISD::STRICT_FSIN}) - if (isOperationExpand(Op, MVT::f32)) - setOperationAction(Op, MVT::f32, Promote); + if (isOperationExpand(Op, MVT::f32)) + setOperationAction(Op, MVT::f32, Promote); + + // The default action for f32 FREMs on X86 is LibCall + // so these need to be explicitly changed to Promote. 
+ setOperationAction(ISD::FREM, MVT::f32, Promote); + setOperationAction(ISD::STRICT_FREM, MVT::f32, Promote); + } // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, Index: llvm/test/Analysis/CostModel/X86/arith-fp.ll =================================================================== --- llvm/test/Analysis/CostModel/X86/arith-fp.ll +++ llvm/test/Analysis/CostModel/X86/arith-fp.ll @@ -628,80 +628,80 @@ define i32 @frem(i32 %arg) { ; SSE1-LABEL: 'frem' -; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef -; SSE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = frem <8 x double> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = frem <2 x double> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = frem <4 x double> undef, undef +; SSE1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8F64 = frem <8 x double> undef, undef ; SSE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE2-LABEL: 'frem' -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 44 for 
instruction: %V8F32 = frem <8 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = frem <8 x double> undef, undef ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SSE42-LABEL: 'frem' -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = frem <8 x double> undef, undef ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX-LABEL: 'frem' -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef -; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%F32 = frem float undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8F32 = frem <8 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %V16F32 = frem <16 x float> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4F64 = frem <4 x double> undef, undef +; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8F64 = frem <8 x double> undef, undef ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'frem' -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16F32 = frem <16 x float> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4F64 = frem <4 x double> undef, undef -; AVX512-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8F64 = frem <8 x double> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V8F32 = frem <8 x float> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V16F32 = frem <16 x float> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4F64 = frem <4 x double> undef, undef +; AVX512-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8F64 = frem <8 x double> undef, undef ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'frem' -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 
= frem <4 x double> undef, undef -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = frem <8 x double> undef, undef ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; GLM-LABEL: 'frem' -; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef -; GLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F64 = frem <8 x double> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F32 = frem float undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4F32 = frem <4 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F32 = frem <8 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V16F32 = frem <16 x float> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %F64 = frem double undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F64 = frem <8 x double> undef, undef ; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F32 = frem float undef, undef Index: llvm/test/CodeGen/X86/frem-libcall.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/frem-libcall.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Ensure vectorized FREMs are not widened/unrolled such that they get lowered +; into libcalls on undef elements. 
+ +; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s + +define float @frem(<2 x float> %a0, <2 x float> %a1, <2 x float> %a2, <2 x float> *%p3) nounwind { +; CHECK-LABEL: frem: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: divps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1] +; CHECK-NEXT: addss %xmm1, %xmm0 +; CHECK-NEXT: movlps %xmm1, (%rbx) +; CHECK-NEXT: addq $64, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem <2 x float> %a0, %a1 + %fdiv = fdiv <2 x float> %frem, %a2 + %ex0 = extractelement <2 x float> %fdiv, i32 0 + %ex1 = extractelement <2 x float> %fdiv, i32 1 + %res = fadd float %ex0, %ex1 + store <2 x float> %fdiv, <2 x float> *%p3 + ret float %res +} + Index: llvm/test/CodeGen/X86/frem.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/frem.ll @@ -0,0 +1,1462 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Basic test coverage for FREM + +; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s + +define void @frem_f16(half %a0, half %a1, half *%p3) nounwind { +; CHECK-LABEL: frem_f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movq %rdx, %rbx +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: movzwl %si, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl %bp, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, (%rbx) +; CHECK-NEXT: addq $8, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq + %frem = frem half %a0, %a1 + store half %frem, half *%p3 + ret void +} + +define void @frem_f32(float %a0, float %a1, float *%p3) nounwind { +; CHECK-LABEL: frem_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movss %xmm0, (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem float %a0, %a1 + store float %frem, float *%p3 + ret void +} + +define void @frem_f64(double %a0, double %a1, double *%p3) nounwind { +; CHECK-LABEL: frem_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movsd %xmm0, (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem double %a0, %a1 + store double %frem, double *%p3 + ret void +} + +define void @frem_f80(x86_fp80 %a0, x86_fp80 %a1, x86_fp80 
*%p3) nounwind { +; CHECK-LABEL: frem_f80: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $32, %rsp +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: fstpt (%rbx) +; CHECK-NEXT: addq $32, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem x86_fp80 %a0, %a1 + store x86_fp80 %frem, x86_fp80 *%p3 + ret void +} + +define void @frem_f128(fp128 %a0, fp128 %a1, fp128 *%p3) nounwind { +; CHECK-LABEL: frem_f128: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: callq fmodl@PLT +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem fp128 %a0, %a1 + store fp128 %frem, fp128 *%p3 + ret void +} + +define void @frem_v16f32(<16 x float> %a0, <16 x float> %a1, <16 x float> *%p3) nounwind { +; CHECK-LABEL: frem_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $160, %rsp +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: movaps %xmm4, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; 
CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, 48(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, 32(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: addq $160, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem <16 x float> %a0, %a1 + store <16 x float> %frem, <16 x float> *%p3 + ret void +} + +define void @frem_v8f32(<8 x float> %a0, <8 x float> %a1, <8 x float> *%p3) nounwind { +; CHECK-LABEL: frem_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $96, %rsp +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; 
CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: addq $96, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem <8 x float> %a0, %a1 + store <8 x float> %frem, <8 x float> *%p3 + ret void +} + +define void @frem_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> *%p3) nounwind { +; CHECK-LABEL: frem_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, (%rbx) +; CHECK-NEXT: addq $64, %rsp +; CHECK-NEXT: 
popq %rbx +; CHECK-NEXT: retq + %frem = frem <4 x float> %a0, %a1 + store <4 x float> %frem, <4 x float> *%p3 + ret void +} + +define void @frem_v8f64(<8 x double> %a0, <8 x double> %a1, <8 x double> *%p3) nounwind { +; CHECK-LABEL: frem_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $144, %rsp +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm4, %xmm1 +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, 48(%rbx) +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; 
CHECK-NEXT: movaps %xmm0, 32(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: addq $144, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem <8 x double> %a0, %a1 + store <8 x double> %frem, <8 x double> *%p3 + ret void +} + +define void @frem_v4f64(<4 x double> %a0, <4 x double> %a1, <4 x double> *%p3) nounwind { +; CHECK-LABEL: frem_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $80, %rsp +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, 16(%rbx) +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: movaps %xmm0, (%rbx) +; CHECK-NEXT: addq $80, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem <4 x double> %a0, %a1 + store <4 x double> %frem, <4 x double> *%p3 + ret void +} + +define void @frem_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> *%p3) nounwind { +; CHECK-LABEL: frem_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $48, %rsp +; CHECK-NEXT: movq %rdi, %rbx +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: callq fmod@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-NEXT: movaps %xmm1, (%rbx) +; CHECK-NEXT: addq $48, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: retq + %frem = frem <2 x double> %a0, %a1 + store <2 x double> %frem, <2 x double> *%p3 + ret void +} + +define void @frem_v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> *%p3) nounwind { +; 
CHECK-LABEL: frem_v32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $248, %rsp +; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %esi, %r14d +; CHECK-NEXT: movl %edi, %ebx +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl 
%eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl %bx, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; 
CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl %r12d, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl %r14w, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl %r13d, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; 
CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill 
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %ebp +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = 
mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %r14d +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %r12d +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %r13d +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %ebx +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, 62(%r15) +; CHECK-NEXT: movw %bx, 60(%r15) +; CHECK-NEXT: movw %r13w, 58(%r15) +; CHECK-NEXT: movw %r12w, 56(%r15) +; CHECK-NEXT: movw %r14w, 54(%r15) +; CHECK-NEXT: movw %bp, 52(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 50(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 48(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 46(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 44(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 42(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 40(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 38(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 36(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 34(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 32(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 30(%r15) +; CHECK-NEXT: movzwl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 28(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 26(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 24(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 22(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 20(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 18(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 16(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 14(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 12(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 10(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 8(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 6(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 4(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 2(%r15) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, (%r15) +; CHECK-NEXT: addq $248, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: retq + %frem = frem <32 x half> %a0, %a1 + store <32 x half> %frem, <32 x half> *%p3 + ret void +} + +define void @frem_v16f16(<16 x half> %a0, <16 x half> %a1, <16 x half> *%p3) nounwind { +; CHECK-LABEL: frem_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $120, %rsp +; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl %esi, %r15d +; CHECK-NEXT: movl %edi, %r14d +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbp +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; 
CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r13d +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl %r14w, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl %ebx, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl %r15w, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl %r12d, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl %r13d, %edi +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw 
%ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; 
CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %r13d +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %r12d +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %ebx +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %r14d +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movl %eax, %r15d +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload +; CHECK-NEXT: callq __gnu_h2f_ieee@PLT +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq fmodf@PLT +; CHECK-NEXT: callq __gnu_f2h_ieee@PLT +; CHECK-NEXT: movw %ax, 30(%rbp) +; CHECK-NEXT: movw %r15w, 28(%rbp) +; CHECK-NEXT: movw %r14w, 26(%rbp) +; CHECK-NEXT: movw %bx, 24(%rbp) +; CHECK-NEXT: movw %r12w, 22(%rbp) +; CHECK-NEXT: movw %r13w, 20(%rbp) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 18(%rbp) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 16(%rbp) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 14(%rbp) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 12(%rbp) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 10(%rbp) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, 8(%rbp) 
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT: movw %ax, 6(%rbp)
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT: movw %ax, 4(%rbp)
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT: movw %ax, 2(%rbp)
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT: movw %ax, (%rbp)
+; CHECK-NEXT: addq $120, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+  %frem = frem <16 x half> %a0, %a1
+  store <16 x half> %frem, <16 x half> *%p3
+  ret void
+}
+
+define void @frem_v8f16(<8 x half> %a0, <8 x half> %a1, <8 x half> *%p3) nounwind {
+; CHECK-LABEL: frem_v8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edx, %ebx
+; CHECK-NEXT: movl %esi, %r13d
+; CHECK-NEXT: movl %edi, %r15d
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT: movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl %r15w, %edi
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; CHECK-NEXT: movl %r12d, %edi
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl %r13w, %edi
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT: movl %eax, %r12d
+; CHECK-NEXT: movl %ebp, %edi
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl %bx, %edi
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT: movl %eax, %r13d
+; CHECK-NEXT: movl %r14d, %edi
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT: movl %eax, %r14d
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT: movl %eax, %r15d
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 2-byte Folded Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT: movl %eax, %ebp
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT: movl %eax, %ebx
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; CHECK-NEXT: callq __gnu_h2f_ieee@PLT
+; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload
+; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq fmodf@PLT
+; CHECK-NEXT: callq __gnu_f2h_ieee@PLT
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movw %ax, 14(%rcx)
+; CHECK-NEXT: movw %bx, 12(%rcx)
+; CHECK-NEXT: movw %bp, 10(%rcx)
+; CHECK-NEXT: movw %r15w, 8(%rcx)
+; CHECK-NEXT: movw %r14w, 6(%rcx)
+; CHECK-NEXT: movw %r13w, 4(%rcx)
+; CHECK-NEXT: movw %r12w, 2(%rcx)
+; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload
+; CHECK-NEXT: movw %ax, (%rcx)
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+  %frem = frem <8 x half> %a0, %a1
+  store <8 x half> %frem, <8 x half> *%p3
+  ret void
+}
+
+define void @frem_v4f80(<4 x x86_fp80> %a0, <4 x x86_fp80> %a1, <4 x x86_fp80> *%p3) nounwind {
+; CHECK-LABEL: frem_v4f80:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fstpt (%rsp)
+; CHECK-NEXT: callq fmodl@PLT
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt (%rsp)
+; CHECK-NEXT: callq fmodl@PLT
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt (%rsp)
+; CHECK-NEXT: callq fmodl@PLT
+; CHECK-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Spill
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp)
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt (%rsp)
+; CHECK-NEXT: callq fmodl@PLT
+; CHECK-NEXT: fstpt 30(%rbx)
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt 20(%rbx)
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt 10(%rbx)
+; CHECK-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) # 10-byte Folded Reload
+; CHECK-NEXT: fstpt (%rbx)
+; CHECK-NEXT: addq $128, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+  %frem = frem <4 x x86_fp80> %a0, %a1
+  store <4 x x86_fp80> %frem, <4 x x86_fp80> *%p3
+  ret void
+}