diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -771,9 +771,12 @@ // scalar broadcast from memory VBROADCAST_LOAD, - // Store FP control world into i16 memory. + // Store FP control word into i16 memory. FNSTCW16m, + // Load FP control word from i16 memory. + FLDCW16m, + /// This instruction implements FP_TO_SINT with the /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. It @@ -1507,6 +1510,9 @@ SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSET_FPMODE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -331,6 +331,11 @@ setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FREM , MVT::f128 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + setOperationAction(ISD::GET_FPMODE , MVT::i64 , Custom); + setOperationAction(ISD::GET_FPMODE , MVT::i32 , Custom); + setOperationAction(ISD::SET_FPMODE , MVT::i64 , Custom); + setOperationAction(ISD::SET_FPMODE , MVT::i32 , Custom); + setOperationAction(ISD::RESET_FPMODE , MVT::Other, Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. 
@@ -26137,6 +26142,147 @@
   return DAG.getMergeValues({RetVal, Chain}, DL);
 }
 
+/// Custom-lower ISD::GET_FPMODE. Reads the x87 control word and, when SSE is
+/// available, MXCSR, and returns them as one integer together with the chain.
+///
+/// Without SSE the control word occupies the low 16 bits of the result and the
+/// remaining bits are unspecified. With SSE the result is an i64 holding the
+/// control word in bits 0-15 and MXCSR in bits 32-63; bits 16-31 are
+/// unspecified.
+SDValue X86TargetLowering::LowerGET_FPMODE(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  unsigned StackSize = Subtarget.hasSSE1() ? 8 : 2;
+  Align StackAlign(StackSize);
+
+  // Save FP Control Word to stack slot
+  int SSFI = MF.getFrameInfo().CreateStackObject(StackSize, StackAlign, false);
+  SDValue StackSlot =
+      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
+  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+
+  SDValue Ops[] = {Chain, StackSlot};
+  Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+                                  StackAlign, MachineMemOperand::MOStore);
+
+  // Load FP Control Word from stack slot
+  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, StackAlign);
+  Chain = CWD.getValue(1);
+  CWD = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, CWD);
+
+  if (!Subtarget.hasSSE1()) {
+    if (Op.getValueType() == MVT::i64)
+      CWD = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, CWD,
+                        DAG.getUNDEF(MVT::i32));
+    return DAG.getMergeValues({CWD, Chain}, DL);
+  }
+
+  // If target supports SSE, read MXCSR as well.
+  assert(Op.getValueType() == MVT::i64);
+  SDValue MXCSRAddr =
+      DAG.getNode(ISD::ADD, DL, StackSlot.getValueType(), StackSlot,
+                  DAG.getConstant(4, DL, StackSlot.getValueType()));
+  // Store MXCSR into memory.
+  Chain = DAG.getNode(
+      ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
+      DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
+      MXCSRAddr);
+
+  if (Subtarget.is64Bit())
+    return DAG.getLoad(MVT::i64, DL, Chain, StackSlot, MPI);
+
+  // Load MXCSR back from offset 4 of the stack slot, where it was stored
+  // above, and put it into the upper 32 bits of the result.
+  SDValue MXCSR =
+      DAG.getLoad(MVT::i32, DL, Chain, MXCSRAddr, MPI.getWithOffset(4));
+  Chain = MXCSR.getValue(1);
+  SDValue Result = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, CWD, MXCSR);
+  return DAG.getMergeValues({Result, Chain}, DL);
+}
+
+/// Custom-lower ISD::SET_FPMODE. Spills the requested mode value to a stack
+/// slot and loads the x87 control word from it with FLDCW; when SSE is
+/// available, MXCSR is additionally loaded from offset 4 of the same slot.
+/// Returns the output chain.
+SDValue X86TargetLowering::LowerSET_FPMODE(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  unsigned StackSize = Subtarget.hasSSE1() ? 8 : 2;
+  Align StackAlign(StackSize);
+
+  // Store FP modes into stack slot.
+  int SSFI = MF.getFrameInfo().CreateStackObject(StackSize, StackAlign, false);
+  SDValue StackSlot =
+      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
+  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
+  SDValue Modes = Op.getOperand(1);
+  // Without SSE only the 16-bit control word is consumed and the stack slot is
+  // just 2 bytes wide, so narrow the value to i16 first; storing a wider value
+  // would write past the end of the slot.
+  if (!Subtarget.hasSSE1())
+    Modes = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Modes);
+  Chain = DAG.getStore(Chain, DL, Modes, StackSlot, MPI);
+
+  // Set X87 state.
+  SDValue Ops[] = {Chain, StackSlot};
+  Chain = DAG.getMemIntrinsicNode(X86ISD::FLDCW16m, DL,
+                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+                                  StackAlign, MachineMemOperand::MOLoad);
+  if (!Subtarget.hasSSE1())
+    return Chain;
+
+  // If target supports SSE, set MXCSR as well.
+  assert(Modes.getValueType() == MVT::i64);
+  SDValue MXCSRAddr =
+      DAG.getNode(ISD::ADD, DL, StackSlot.getValueType(), StackSlot,
+                  DAG.getConstant(4, DL, StackSlot.getValueType()));
+  Chain = DAG.getNode(
+      ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
+      DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
+      MXCSRAddr);
+  return Chain;
+}
+
+/// Custom-lower ISD::RESET_FPMODE. Loads default values, kept in the constant
+/// pool, into the x87 control word and, when SSE is available, into MXCSR.
+/// Returns the output chain.
+SDValue X86TargetLowering::LowerRESET_FPMODE(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  // Constant-pool addresses must be created with the target's pointer type.
+  MVT PtrVT = getPointerTy(DAG.getDataLayout());
+
+  // x87 FPU Control Word: mask all floating-point exceptions, sets rounding to
+  // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
+  // for compatibility with glibc.
+  unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
+  auto *X87Mode = ConstantInt::get(Type::getInt32Ty(*DAG.getContext()), X87CW);
+  SDValue Mode = DAG.getConstantPool(X87Mode, PtrVT);
+  SDValue Ops[] = {Chain, Mode};
+  MachinePointerInfo MPI =
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+  Chain = DAG.getMemIntrinsicNode(X86ISD::FLDCW16m, DL,
+                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+                                  Align(2), MachineMemOperand::MOLoad);
+  if (!Subtarget.hasSSE1())
+    return Chain;
+
+  // MXCSR: mask all floating-point exceptions, sets rounding to nearest,
+  // clear all exceptions, sets DAZ and FTZ to 0.
+  auto *MXCSR = ConstantInt::get(Type::getInt32Ty(*DAG.getContext()), 0x1F80);
+  Mode = DAG.getConstantPool(MXCSR, PtrVT);
+  Chain = DAG.getNode(
+      ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
+      DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32), Mode);
+  return Chain;
+}
+
 /// Lower a vector CTLZ using native supported vector CTLZ instruction.
// // i8/i16 vector implemented using dword LZCNT vector instruction @@ -29267,6 +29393,9 @@ case ISD::GC_TRANSITION_START: case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); + case ISD::GET_FPMODE: return LowerGET_FPMODE(Op, DAG); + case ISD::SET_FPMODE: return LowerSET_FPMODE(Op, DAG); + case ISD::RESET_FPMODE: return LowerRESET_FPMODE(Op, DAG); case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); } } @@ -30218,6 +30347,12 @@ Results.push_back(V); return; } + case ISD::GET_FPMODE: { + SDValue V = LowerGET_FPMODE(SDValue(N, 0), DAG); + Results.push_back(V); + Results.push_back(V.getValue(1)); + return; + } } } @@ -30304,6 +30439,7 @@ NODE_NAME_CASE(EH_RETURN) NODE_NAME_CASE(TC_RETURN) NODE_NAME_CASE(FNSTCW16m) + NODE_NAME_CASE(FLDCW16m) NODE_NAME_CASE(LCMPXCHG_DAG) NODE_NAME_CASE(LCMPXCHG8_DAG) NODE_NAME_CASE(LCMPXCHG16_DAG) diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td --- a/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/llvm/lib/Target/X86/X86InstrFPStack.td @@ -23,7 +23,7 @@ def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; -def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86CwdAccess : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; @@ -35,9 +35,12 @@ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, +def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdAccess, [SDNPHasChain, SDNPMayStore, SDNPSideEffect, SDNPMemOperand]>; +def X86fp_cwd_set16 : SDNode<"X86ISD::FLDCW16m", SDTX86CwdAccess, + [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, + SDNPMemOperand]>; def X86fstf32 : 
PatFrag<(ops node:$val, node:$ptr), (X86fst node:$val, node:$ptr), [{ @@ -705,7 +708,8 @@ } // SchedRW let Defs = [FPSW,FPCW], mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] - (outs), (ins i16mem:$dst), "fldcw\t$dst", []>, + (outs), (ins i16mem:$dst), "fldcw\t$dst", + [(X86fp_cwd_set16 addr:$dst)]>, Sched<[WriteLoad]>; // FPU control instructions diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fpenv.ll @@ -0,0 +1,70 @@ +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,X86-NOSSE +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,X86-SSE +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,X64 + + +define i64 @func_01() { +entry: + %v = call i64 @llvm.get.fpmode.i64() + ret i64 %v +} +; CHECK-LABEL: func_01: + +; X86-NOSSE: fnstcw (%esp) +; X86-NOSSE-NEXT: movzwl (%esp), %eax + +; X86-SSE: fnstcw (%esp) +; X86-SSE-NEXT: movl (%esp), %eax +; X86-SSE-NEXT: stmxcsr 4(%esp) +; X86-SSE-NEXT: movl (%esp), %edx + +; X64: fnstcw -8(%rsp) +; X64-NEXT: stmxcsr -4(%rsp) +; X64-NEXT: movq -8(%rsp), %rax +; X64-NEXT: retq + + +define void @func_02(i64 %fpe) { +entry: + call void @llvm.set.fpmode.i64(i64 %fpe) + ret void +} +; CHECK-LABEL: func_02: + +; X86-NOSSE: movl 8(%esp), %eax +; X86-NOSSE-NEXT: movl %eax, 2(%esp) +; X86-NOSSE-NEXT: fldcw 2(%esp) + +; X86-SSE: movl %ecx, 4(%esp) +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: ldmxcsr 4(%esp) + +; X64: movq %rdi, -8(%rsp) +; X64-NEXT: fldcw -8(%rsp) +; X64-NEXT: ldmxcsr -4(%rsp) +; X64-NEXT: retq + + +define void @func_03() { +entry: + call void @llvm.reset.fpmode() + ret void +} +; CHECK-LABEL: func_03: + +; X86-NOSSE: fldcw .LCPI +; X86-NOSSE-NEXT: retl + +; X86-SSE: fldcw .LCPI 
+; X86-SSE-NEXT: ldmxcsr .LCPI +; X86-SSE-NEXT: retl + +; X64: fldcw .LCPI{{.*}}(%rip) +; X64-NEXT: ldmxcsr .LCPI{{.*}}(%rip) +; X64-NEXT: retq + + +declare i64 @llvm.get.fpmode.i64() +declare void @llvm.set.fpmode.i64(i64 %fpenv) +declare void @llvm.reset.fpmode() diff --git a/llvm/test/CodeGen/X86/fpenv32.ll b/llvm/test/CodeGen/X86/fpenv32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fpenv32.ll @@ -0,0 +1,26 @@ +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s | FileCheck %s + + +define i32 @func_01() { +entry: + %v = call i32 @llvm.get.fpmode.i32() + ret i32 %v +} +; CHECK-LABEL: func_01: +; CHECK: fnstcw (%esp) +; CHECK-NEXT: movzwl (%esp), %eax + + +define void @func_02(i32 %fpe) { +entry: + call void @llvm.set.fpmode.i32(i32 %fpe) + ret void +} +; CHECK-LABEL: func_02: +; CHECK: movl 8(%esp), %eax +; CHECK-NEXT: movl %eax, 2(%esp) +; CHECK-NEXT: fldcw 2(%esp) + + +declare i32 @llvm.get.fpmode.i32() +declare void @llvm.set.fpmode.i32(i32 %fpenv)