diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -780,9 +780,12 @@
     // subvector broadcast from memory.
     SUBV_BROADCAST_LOAD,
 
-    // Store FP control world into i16 memory.
+    // Store FP control word into i16 memory.
     FNSTCW16m,
 
+    // Load FP control word from i16 memory.
+    FLDCW16m,
+
     /// This instruction implements FP_TO_SINT with the
     /// integer destination in memory and a FP reg source. This corresponds
     /// to the X86::FIST*m instructions and the rounding mode change stuff. It
@@ -1521,6 +1524,9 @@
     SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -359,6 +359,11 @@
   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
   setOperationAction(ISD::FREM             , MVT::f128 , Expand);
   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
+  setOperationAction(ISD::GET_FPMODE       , MVT::i64  , Custom);
+  setOperationAction(ISD::GET_FPMODE       , MVT::i32  , Custom);
+  setOperationAction(ISD::SET_FPMODE       , MVT::i64  , Custom);
+  setOperationAction(ISD::SET_FPMODE       , MVT::i32  , Custom);
+  setOperationAction(ISD::RESET_FPMODE     , MVT::Other, Custom);
 
   // Promote the i8 variants and force them on up to i32 which has a shorter
   // encoding.
@@ -26940,6 +26945,128 @@
   return DAG.getMergeValues({RetVal, Chain}, DL);
 }
 
+SDValue X86TargetLowering::LowerGET_FPMODE(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  unsigned StackSize = Subtarget.hasSSE1() ? 8 : 2;
+  Align StackAlign(StackSize);
+
+  // Save FP Control Word to stack slot
+  int SSFI = MF.getFrameInfo().CreateStackObject(StackSize, StackAlign, false);
+  SDValue StackSlot =
+      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
+  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+
+  SDValue Ops[] = {Chain, StackSlot};
+  Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+                                  StackAlign, MachineMemOperand::MOStore);
+
+  // Load FP Control Word from stack slot
+  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, StackAlign);
+  Chain = CWD.getValue(1);
+  CWD = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, CWD);
+
+  if (!Subtarget.hasSSE1()) {
+    if (Op.getValueType() == MVT::i64)
+      CWD = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, CWD,
+                        DAG.getUNDEF(MVT::i32));
+    return DAG.getMergeValues({CWD, Chain}, DL);
+  }
+
+  // If target supports SSE, read MXCSR as well.
+  assert(Op.getValueType() == MVT::i64);
+  SDValue MXCSRAddr =
+      DAG.getNode(ISD::ADD, DL, StackSlot.getValueType(), StackSlot,
+                  DAG.getConstant(4, DL, StackSlot.getValueType()));
+  // Store MXCSR into memory.
+  Chain = DAG.getNode(
+      ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
+      DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
+      MXCSRAddr);
+
+  if (Subtarget.is64Bit())
+    return DAG.getLoad(MVT::i64, DL, Chain, StackSlot, MPI);
+
+  // Load MXCSR from stack slot and put it into upper 32 bits of result.
+  SDValue MXCSR = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
+  Chain = MXCSR.getValue(1);
+  SDValue Result = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, CWD, MXCSR);
+  return DAG.getMergeValues({Result, Chain}, DL);
+}
+
+SDValue X86TargetLowering::LowerSET_FPMODE(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  unsigned StackSize = Subtarget.hasSSE1() ? 8 : 2;
+  Align StackAlign(StackSize);
+
+  // Store FP modes into stack slot.
+  int SSFI = MF.getFrameInfo().CreateStackObject(StackSize, StackAlign, false);
+  SDValue StackSlot =
+      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
+  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+  SDValue Modes = Op.getOperand(1);
+  if (!Subtarget.hasSSE1() && Modes.getValueType() == MVT::i64)
+    Modes = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Modes);
+  Chain = DAG.getStore(Chain, DL, Modes, StackSlot, MPI);
+
+  // Set X87 state.
+  SDValue Ops[] = {Chain, StackSlot};
+  Chain = DAG.getMemIntrinsicNode(X86ISD::FLDCW16m, DL,
+                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+                                  StackAlign, MachineMemOperand::MOLoad);
+  if (!Subtarget.hasSSE1())
+    return Chain;
+
+  // If target supports SSE, set MXCSR as well.
+  assert(Modes.getValueType() == MVT::i64);
+  SDValue MXCSRAddr =
+      DAG.getNode(ISD::ADD, DL, StackSlot.getValueType(), StackSlot,
+                  DAG.getConstant(4, DL, StackSlot.getValueType()));
+  Chain = DAG.getNode(
+      ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
+      DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
+      MXCSRAddr);
+  return Chain;
+}
+
+SDValue X86TargetLowering::LowerRESET_FPMODE(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+
+  // x87 FPU Control Word: masks all floating-point exceptions, sets rounding
+  // to nearest. FPU precision is set to 53 bits on Windows and 64 bits
+  // otherwise for compatibility with glibc.
+  const auto &Subtarget = static_cast<const X86Subtarget &>(DAG.getSubtarget());
+  unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
+  auto X87Mode = ConstantInt::get(Type::getInt32Ty(*DAG.getContext()), X87CW);
+  MVT PtrVT = getPointerTy(DAG.getDataLayout());
+  SDValue Mode = DAG.getConstantPool(X87Mode, PtrVT);
+  SDValue Ops[] = {Chain, Mode};
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+  Chain = DAG.getMemIntrinsicNode(X86ISD::FLDCW16m, DL,
+                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+                                  Align(2), MachineMemOperand::MOLoad);
+  if (!Subtarget.hasSSE1())
+    return Chain;
+
+  // MXCSR: masks all floating-point exceptions, sets rounding to nearest,
+  // clears all exceptions, sets DAZ and FTZ to 0.
+  auto MXCSR = ConstantInt::get(Type::getInt32Ty(*DAG.getContext()), 0x1F80);
+  Mode = DAG.getConstantPool(MXCSR, PtrVT);
+  Chain = DAG.getNode(
+      ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
+      DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32), Mode);
+  return Chain;
+}
+
 /// Lower a vector CTLZ using native supported vector CTLZ instruction.
 //
 // i8/i16 vector implemented using dword LZCNT vector instruction
@@ -30066,6 +30193,9 @@
   case ISD::GC_TRANSITION_START:
   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION(Op, DAG);
   case ISD::ADDRSPACECAST:      return LowerADDRSPACECAST(Op, DAG);
+  case ISD::GET_FPMODE:         return LowerGET_FPMODE(Op, DAG);
+  case ISD::SET_FPMODE:         return LowerSET_FPMODE(Op, DAG);
+  case ISD::RESET_FPMODE:       return LowerRESET_FPMODE(Op, DAG);
   case X86ISD::CVTPS2PH:        return LowerCVTPS2PH(Op, DAG);
   }
 }
@@ -30956,6 +31086,12 @@
     // to move the scalar in two i32 pieces.
     Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
     return;
+  case ISD::GET_FPMODE: {
+    SDValue V = LowerGET_FPMODE(SDValue(N, 0), DAG);
+    Results.push_back(V);
+    Results.push_back(V.getValue(1));
+    return;
+  }
   }
 }
 
@@ -31043,6 +31179,7 @@
   NODE_NAME_CASE(EH_RETURN)
   NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(FNSTCW16m)
+  NODE_NAME_CASE(FLDCW16m)
   NODE_NAME_CASE(LCMPXCHG_DAG)
   NODE_NAME_CASE(LCMPXCHG8_DAG)
   NODE_NAME_CASE(LCMPXCHG16_DAG)
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -23,7 +23,7 @@
 def SDTX86Fild       : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
 def SDTX86Fist       : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
 
-def SDTX86CwdStore   : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86CwdAccess  : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 
 def X86fld           : SDNode<"X86ISD::FLD", SDTX86Fld,
                               [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -35,9 +35,12 @@
                               [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 def X86fp_to_mem     : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst,
                               [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86fp_cwd_get16  : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
+def X86fp_cwd_get16  : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdAccess,
                               [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
                                SDNPMemOperand]>;
+def X86fp_cwd_set16  : SDNode<"X86ISD::FLDCW16m", SDTX86CwdAccess,
+                              [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+                               SDNPMemOperand]>;
 
 def X86fstf32 : PatFrag<(ops node:$val, node:$ptr),
                         (X86fst node:$val, node:$ptr), [{
@@ -705,7 +708,8 @@
 } // SchedRW
 let Defs = [FPSW,FPCW], mayLoad = 1 in
 def FLDCW16m  : I<0xD9, MRM5m,                   // X87 control world = [mem16]
-                  (outs), (ins i16mem:$dst), "fldcw\t$dst", []>,
+                  (outs), (ins i16mem:$dst), "fldcw\t$dst",
+                  [(X86fp_cwd_set16 addr:$dst)]>,
                   Sched<[WriteLoad]>;
 
 // FPU control instructions
diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fpenv.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s | FileCheck %s -check-prefix=X86-NOSSE
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse -verify-machineinstrs < %s | FileCheck %s -check-prefix=X86-SSE
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s -check-prefix=X64
+
+
+define i64 @func_01() {
+; X86-NOSSE-LABEL: func_01:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    subl $2, %esp
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 6
+; X86-NOSSE-NEXT:    fnstcw (%esp)
+; X86-NOSSE-NEXT:    movzwl (%esp), %eax
+; X86-NOSSE-NEXT:    addl $2, %esp
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE-LABEL: func_01:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 16
+; X86-SSE-NEXT:    fnstcw (%esp)
+; X86-SSE-NEXT:    movl (%esp), %eax
+; X86-SSE-NEXT:    stmxcsr {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movl (%esp), %edx
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE-NEXT:    retl
+;
+; X64-LABEL: func_01:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    stmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    retq
+entry:
+  %v = call i64 @llvm.get.fpmode.i64()
+  ret i64 %v
+}
+
+
+define void @func_02(i64 %fpe) {
+; X86-NOSSE-LABEL: func_02:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    pushl %eax
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    fldcw {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    popl %eax
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE-LABEL: func_02:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 16
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    movl %eax, (%esp)
+; X86-SSE-NEXT:    fldcw (%esp)
+; X86-SSE-NEXT:    ldmxcsr {{[0-9]+}}(%esp)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE-NEXT:    retl
+;
+; X64-LABEL: func_02:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    fldcw -{{[0-9]+}}(%rsp)
+; X64-NEXT:    ldmxcsr -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+entry:
+  call void @llvm.set.fpmode.i64(i64 %fpe)
+  ret void
+}
+
+
+define void @func_03() {
+; X86-NOSSE-LABEL: func_03:
+; X86-NOSSE:       # %bb.0: # %entry
+; X86-NOSSE-NEXT:    fldcw {{\.LCPI.*}}
+; X86-NOSSE-NEXT:    retl
+;
+; X86-SSE-LABEL: func_03:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    fldcw {{\.LCPI.*}}
+; X86-SSE-NEXT:    ldmxcsr {{\.LCPI.*}}
+; X86-SSE-NEXT:    retl
+;
+; X64-LABEL: func_03:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    fldcw {{.*}}(%rip)
+; X64-NEXT:    ldmxcsr {{.*}}(%rip)
+; X64-NEXT:    retq
+entry:
+  call void @llvm.reset.fpmode()
+  ret void
+}
+
+
+declare i64 @llvm.get.fpmode.i64()
+declare void @llvm.set.fpmode.i64(i64 %fpenv)
+declare void @llvm.reset.fpmode()
diff --git a/llvm/test/CodeGen/X86/fpenv32.ll b/llvm/test/CodeGen/X86/fpenv32.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fpenv32.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s | FileCheck %s
+
+
+define i64 @func_01() {
+; CHECK-LABEL: func_01:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subl $2, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 6
+; CHECK-NEXT:    fnstcw (%esp)
+; CHECK-NEXT:    movzwl (%esp), %eax
+; CHECK-NEXT:    addl $2, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+entry:
+  %v = call i64 @llvm.get.fpmode.i64()
+  ret i64 %v
+}
+
+
+define void @func_02(i64 %fpe) {
+; CHECK-LABEL: func_02:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %eax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fldcw {{[0-9]+}}(%esp)
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:    .cfi_def_cfa_offset 4
+; CHECK-NEXT:    retl
+entry:
+  call void @llvm.set.fpmode.i64(i64 %fpe)
+  ret void
+}
+
+
+declare i64 @llvm.get.fpmode.i64()
+declare void @llvm.set.fpmode.i64(i64 %fpenv)