diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -650,9 +650,12 @@ // scalar broadcast from memory VBROADCAST_LOAD, - // Store FP control world into i16 memory. + // Store FP control word into i16 memory. FNSTCW16m, + // Load FP control word from i16 memory. + FLDCW16m, + /// This instruction implements FP_TO_SINT with the /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. It @@ -703,6 +706,19 @@ }; } // end namespace X86ISD + namespace X86 { + /// Current rounding mode is represented in bits 11:10 of FPCW. These + /// values are the same as corresponding constants for rounding mode used + /// in glibc. + enum RoundingMode { + rmToNearest = 0, // FE_TONEAREST + rmDownward = 1 << 10, // FE_DOWNWARD + rmUpward = 2 << 10, // FE_UPWARD + rmTowardZero = 3 << 10, // FE_TOWARDZERO + rmMask = 3 << 10 // Bit mask selecting rounding mode + }; + } // end namespace X86 + /// Define some predicates that are used for node matching. namespace X86 { /// Returns true if Elt is a constant zero or floating point constant +0.0. 
@@ -1369,6 +1385,7 @@
     SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -336,6 +336,7 @@
     setOperationAction(ISD::FREM             , MVT::f80  , Expand);
     setOperationAction(ISD::FREM             , MVT::f128 , Expand);
     setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
+    setOperationAction(ISD::SET_ROUNDING     , MVT::Other, Custom);
 
     // Promote the i8 variants and force them on up to i32 which has a shorter
     // encoding.
@@ -25759,6 +25760,134 @@
   return DAG.getZExtOrTrunc(RetVal, DL, VT);
 }
 
+/// Lower ISD::SET_ROUNDING (the llvm.set.rounding intrinsic): rewrite the
+/// rounding control field of the x87 FP control word and, if SSE is available,
+/// of MXCSR.
+///
+/// Operand 0 of \p Op is the incoming chain and operand 1 is the requested
+/// rounding mode, encoded as 0 - toward zero, 1 - to nearest, 2 - upward,
+/// 3 - downward. The returned value is the chain of the last control register
+/// load.
+SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getNode()->getOperand(0);
+
+  // FP control word may be set only from data in memory. So we need to allocate
+  // stack space to save/load FP control word. The slot is 4 bytes wide because
+  // it is reused for the 32-bit MXCSR value below.
+  int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, 4, false);
+  SDValue StackSlot =
+      DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
+  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, 2);
+
+  // Store FP control word into memory.
+  SDValue Ops[] = {Chain, StackSlot};
+  Chain = DAG.getMemIntrinsicNode(
+      X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
+
+  // Load FP Control Word from stack slot and clear RM field (bits 11:10).
+  SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
+  Chain = CWD.getValue(1);
+  CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
+                    DAG.getConstant(0xf3ff, DL, MVT::i16));
+
+  // Calculate new rounding mode.
+  SDValue NewRM = Op.getNode()->getOperand(1);
+  SDValue RMBits;
+  if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
+    // The mode is a constant, so the field value can be selected right here.
+    uint64_t RM = CVal->getZExtValue();
+    int FieldVal;
+    switch (RM) {
+    case llvm::ToNearestTiesToEven: FieldVal = X86::rmToNearest; break;
+    case llvm::Downward:            FieldVal = X86::rmDownward; break;
+    case llvm::Upward:              FieldVal = X86::rmUpward; break;
+    case llvm::TowardZero:          FieldVal = X86::rmTowardZero; break;
+    default:
+      // The constant comes straight from the IR and may be a value that is
+      // not handled above, so fail deterministically in every build mode.
+      report_fatal_error("rounding mode is not supported by X86 hardware");
+    }
+    RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
+  } else {
+    // Need to convert argument into bits of control word:
+    //    0 Round to 0       -> 11
+    //    1 Round to nearest -> 00
+    //    2 Round to +inf    -> 10
+    //    3 Round to -inf    -> 01
+    // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
+    // To make the conversion, put all these values into a value 0xc9 and shift
+    // it left depending on the rounding mode:
+    //    (0xc9 << 4) & 0xc00 = X86::rmTowardZero
+    //    (0xc9 << 6) & 0xc00 = X86::rmToNearest
+    //    (0xc9 << 8) & 0xc00 = X86::rmUpward
+    //    (0xc9 << 10) & 0xc00 = X86::rmDownward
+    // that is:
+    //    (0xc9 << (2 * NewRM + 4)) & 0xc00
+    // Only the four values listed above are handled by this formula.
+    SDValue ShiftValue =
+        DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                    DAG.getNode(ISD::ADD, DL, MVT::i32,
+                                DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
+                                            DAG.getConstant(1, DL, MVT::i8)),
+                                DAG.getConstant(4, DL, MVT::i32)));
+    SDValue Shifted =
+        DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
+                    ShiftValue);
+    RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
+                         DAG.getConstant(0xc00, DL, MVT::i16));
+  }
+
+  // Update rounding mode bits and store the new FP Control Word into stack.
+  CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
+  Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2);
+
+  // Load FP control word from the slot.
+  SDValue OpsLD[] = {Chain, StackSlot};
+  MachineMemOperand *MMOL =
+      MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, 2);
+  Chain = DAG.getMemIntrinsicNode(
+      X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
+
+  // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
+  // same way but in bits 14:13.
+  if (Subtarget.hasSSE1()) {
+    // Store MXCSR into memory.
+    Chain = DAG.getNode(
+        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
+        DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
+        StackSlot);
+
+    // Load MXCSR from stack slot and clear RM field (bits 14:13).
+    SDValue MXCSRVal = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
+    Chain = MXCSRVal.getValue(1);
+    MXCSRVal = DAG.getNode(ISD::AND, DL, MVT::i32, MXCSRVal.getValue(0),
+                           DAG.getConstant(0xffff9fff, DL, MVT::i32));
+
+    // Shift X87 RM bits from 11:10 to 14:13.
+    RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
+    RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
+                         DAG.getConstant(3, DL, MVT::i8));
+
+    // Update rounding mode bits and store the new MXCSR value into stack.
+    MXCSRVal = DAG.getNode(ISD::OR, DL, MVT::i32, MXCSRVal, RMBits);
+    Chain = DAG.getStore(Chain, DL, MXCSRVal, StackSlot, MPI,
+                         /* Alignment = */ 4);
+
+    // Load MXCSR from the slot.
+    Chain = DAG.getNode(
+        ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
+        DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
+        StackSlot);
+  }
+
+  return Chain;
+}
+
 // Split an unary integer op into 2 half sized ops.
 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
@@ -28943,6 +29072,7 @@
   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::SET_ROUNDING:       return LowerSET_ROUNDING(Op, DAG);
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
   case ISD::CTTZ:
@@ -30006,6 +30136,7 @@
   NODE_NAME_CASE(EH_RETURN)
   NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(FNSTCW16m)
+  NODE_NAME_CASE(FLDCW16m)
   NODE_NAME_CASE(LCMPXCHG_DAG)
   NODE_NAME_CASE(LCMPXCHG8_DAG)
   NODE_NAME_CASE(LCMPXCHG16_DAG)
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -24,6 +24,7 @@
 def SDTX86Fist      : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
 
 def SDTX86CwdStore  : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86CwdLoad   : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 
 def X86fld          : SDNode<"X86ISD::FLD", SDTX86Fld,
                              [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -38,6 +39,9 @@
 def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
                              [SDNPHasChain, SDNPMayStore, SDNPSideEffect,
                               SDNPMemOperand]>;
+def X86fp_cwd_set16 : SDNode<"X86ISD::FLDCW16m", SDTX86CwdLoad,
+                             [SDNPHasChain, SDNPMayLoad, SDNPSideEffect,
+                              SDNPMemOperand]>;
 
 def X86fstf32 : PatFrag<(ops node:$val, node:$ptr),
                         (X86fst node:$val, node:$ptr), [{
@@ -703,7 +707,8 @@
 } // SchedRW
 let Defs = 
[FPSW,FPCW], mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] - (outs), (ins i16mem:$dst), "fldcw\t$dst", []>, + (outs), (ins i16mem:$dst), "fldcw\t$dst", + [(X86fp_cwd_set16 addr:$dst)]>, Sched<[WriteLoad]>; // FPU control instructions diff --git a/llvm/test/CodeGen/X86/set_rounding.ll b/llvm/test/CodeGen/X86/set_rounding.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/set_rounding.ll @@ -0,0 +1,230 @@ +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s | FileCheck %s -check-prefix=X86-NOSSE +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse -verify-machineinstrs < %s | FileCheck %s -check-prefix=X86-SSE +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s -check-prefix=X64 + +declare void @llvm.set.rounding(i32 %x) + +define void @func_01() nounwind { + call void @llvm.set.rounding(i32 0) ; TowardZero (CW[11-10] = 11) + ret void + +; X86-NOSSE-LABEL: func_01: +; X86-NOSSE: fnstcw (%esp) +; X86-NOSSE-NEXT: orb $12, 1(%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; +; X86-SSE-LABEL: func_01: +; X86-SSE: fnstcw (%esp) +; X86-SSE-NEXT: orb $12, 1(%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) + ; 96 = 0x60 +; X86-SSE-NEXT: orb $96, 1(%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; +; X64-LABEL: func_01: +; X64: fnstcw -4(%rsp) +; X64-NEXT: orb $12, -3(%rsp) +; X64-NEXT: fldcw -4(%rsp) +; X64-NEXT: stmxcsr -4(%rsp) + ; 96 = 0x60 +; X64-NEXT: orb $96, -3(%rsp) +; X64-NEXT: ldmxcsr -4(%rsp) +} + +define void @func_02() nounwind { + call void @llvm.set.rounding(i32 1) ; ToNearestTiesToEven (CW[11-10] = 00) + ret void + +; X86-NOSSE-LABEL: func_02: +; X86-NOSSE: fnstcw (%esp) + ; -13 = 0xF3 +; X86-NOSSE-NEXT: andb $-13, 1(%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; +; X86-SSE-LABEL: func_02: +; X86-SSE: fnstcw (%esp) + ; -13 = 0xF3 +; X86-SSE-NEXT: andb $-13, 1(%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) + ; -97 = 0x9F +; 
X86-SSE-NEXT: andb $-97, 1(%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; +; X64-LABEL: func_02: +; X64: fnstcw -4(%rsp) + ; -13 = 0xF3 +; X64-NEXT: andb $-13, -3(%rsp) +; X64-NEXT: fldcw -4(%rsp) +; X64-NEXT: stmxcsr -4(%rsp) + ; -97 = 0x9F +; X64-NEXT: andb $-97, -3(%rsp) +; X64-NEXT: ldmxcsr -4(%rsp) +} + +define void @func_03() nounwind { + call void @llvm.set.rounding(i32 2) ; Upward (CW[11-10] = 10) + ret void + +; X86-NOSSE-LABEL: func_03: +; X86-NOSSE: fnstcw (%esp) + ; -3073 = 0xF3FF +; X86-NOSSE-NEXT: movl $-3073, %eax +; X86-NOSSE-NEXT: andl (%esp), %eax + ; 2048 = 0x0800 +; X86-NOSSE-NEXT: orl $2048, %eax +; X86-NOSSE-NEXT: movw %ax, (%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; +; X86-SSE-LABEL: func_03: +; X86-SSE: fnstcw (%esp) + ; -3073 = 0xF3FF +; X86-SSE-NEXT: movl $-3073, %eax +; X86-SSE-NEXT: andl (%esp), %eax + ; 2048 = 0x0800 +; X86-SSE-NEXT: orl $2048, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) + ; -24577 = 0xFFFF9FFF +; X86-SSE-NEXT: movl $-24577, %eax +; X86-SSE-NEXT: andl (%esp), %eax + ; 16384 = 0x4000 +; X86-SSE-NEXT: orl $16384, %eax +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; +; X64-LABEL: func_03: +; X64: fnstcw -4(%rsp) + ; -3073 = 0xF3FF +; X64-NEXT: movl $-3073, %eax +; X64-NEXT: andl -4(%rsp), %eax + ; 2048 = 0x0800 +; X64-NEXT: orl $2048, %eax +; X64-NEXT: movw %ax, -4(%rsp) +; X64-NEXT: fldcw -4(%rsp) +; X64-NEXT: stmxcsr -4(%rsp) + ; -24577 = 0xFFFF9FFF +; X64-NEXT: movl $-24577, %eax +; X64-NEXT: andl -4(%rsp), %eax + ; 16384 = 0x4000 +; X64-NEXT: orl $16384, %eax +; X64-NEXT: movl %eax, -4(%rsp) +; X64-NEXT: ldmxcsr -4(%rsp) +} + +define void @func_04() nounwind { + call void @llvm.set.rounding(i32 3) ; Downward (CW[11-10] = 01) + ret void + +; X86-NOSSE-LABEL: func_04: +; X86-NOSSE: fnstcw (%esp) + ; -3073 = 0xF3FF +; X86-NOSSE-NEXT: movl $-3073, %eax +; X86-NOSSE-NEXT: andl (%esp), %eax + ; 1024 = 0x400 +; X86-NOSSE-NEXT: orl $1024, %eax +; 
X86-NOSSE-NEXT: movw %ax, (%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; +; X86-SSE-LABEL: func_04: +; X86-SSE: fnstcw (%esp) + ; -3073 = 0xF3FF +; X86-SSE-NEXT: movl $-3073, %eax +; X86-SSE-NEXT: andl (%esp), %eax + ; 1024 = 0x400 +; X86-SSE-NEXT: orl $1024, %eax +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) + ; -24577 = 0xFFFF9FFF +; X86-SSE-NEXT: movl $-24577, %eax +; X86-SSE-NEXT: andl (%esp), %eax + ; 8192 = 0x2000 +; X86-SSE-NEXT: orl $8192, %eax +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; +; X64-LABEL: func_04: +; X64: fnstcw -4(%rsp) + ; -3073 = 0xF3FF +; X64-NEXT: movl $-3073, %eax +; X64-NEXT: andl -4(%rsp), %eax + ; 1024 = 0x400 +; X64-NEXT: orl $1024, %eax +; X64-NEXT: movw %ax, -4(%rsp) +; X64-NEXT: fldcw -4(%rsp) +; X64-NEXT: stmxcsr -4(%rsp) + ; -24577 = 0xFFFF9FFF +; X64-NEXT: movl $-24577, %eax +; X64-NEXT: andl -4(%rsp), %eax + ; 8192 = 0x2000 +; X64-NEXT: orl $8192, %eax +; X64-NEXT: movl %eax, -4(%rsp) +; X64-NEXT: ldmxcsr -4(%rsp) +; X64-NEXT: retq +} + +define void @func_05(i32 %x) nounwind { + call void @llvm.set.rounding(i32 %x) ; Mode is only known at run time + ret void + +; X86-NOSSE-LABEL: func_05: +; X86-NOSSE: leal 4(%eax,%eax), %ecx + ; 201 = 0xC9 +; X86-NOSSE: movl $201, %eax +; X86-NOSSE: shll %cl, %eax + ; 3072 = 0x0C00 +; X86-NOSSE-NEXT: andl $3072, %eax +; X86-NOSSE-NEXT: fnstcw (%esp) + ; -3073 = 0xF3FF +; X86-NOSSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF +; X86-NOSSE-NEXT: andl (%esp), %ecx +; X86-NOSSE-NEXT: orl %eax, %ecx +; X86-NOSSE-NEXT: movw %cx, (%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; +; X86-SSE-LABEL: func_05: +; X86-SSE: leal 4(%eax,%eax), %ecx + ; 201 = 0xC9 +; X86-SSE: movl $201, %eax +; X86-SSE: shll %cl, %eax + ; 3072 = 0x0C00 +; X86-SSE-NEXT: andl $3072, %eax +; X86-SSE-NEXT: fnstcw (%esp) + ; -3073 = 0xF3FF +; X86-SSE-NEXT: movl $-3073, %ecx +; X86-SSE-NEXT: andl (%esp), %ecx +; X86-SSE-NEXT: orl %eax, %ecx +; X86-SSE-NEXT: movw %cx, (%esp) +; X86-SSE-NEXT: fldcw (%esp) 
+; X86-SSE-NEXT: stmxcsr (%esp) + ; -24577 = 0xFFFF9FFF +; X86-SSE-NEXT: movl $-24577, %ecx +; X86-SSE-NEXT: andl (%esp), %ecx +; X86-SSE-NEXT: leal (%ecx,%eax,8), %eax +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; +; X64-LABEL: func_05: +; X64: leal 4(%rdi,%rdi), %ecx +; X64: movl $201, %eax +; X64: shll %cl, %eax + ; 3072 = 0x0C00 +; X64-NEXT: andl $3072, %eax +; X64-NEXT: fnstcw -4(%rsp) + ; -3073 = 0xF3FF +; X64-NEXT: movl $-3073, %ecx +; X64-NEXT: andl -4(%rsp), %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movw %cx, -4(%rsp) +; X64-NEXT: fldcw -4(%rsp) +; X64-NEXT: stmxcsr -4(%rsp) + ; -24577 = 0xFFFF9FFF +; X64-NEXT: movl $-24577, %ecx +; X64-NEXT: andl -4(%rsp), %ecx +; X64-NEXT: leal (%rcx,%rax,8), %eax +; X64-NEXT: movl %eax, -4(%rsp) +; X64-NEXT: ldmxcsr -4(%rsp) +}