diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -783,6 +783,9 @@ // Store FP control world into i16 memory. FNSTCW16m, + // Load FP control word from i16 memory. + FLDCW16m, + /// This instruction implements FP_TO_SINT with the /// integer destination in memory and a FP reg source. This corresponds /// to the X86::FIST*m instructions and the rounding mode change stuff. It @@ -847,6 +850,20 @@ }; } // end namespace X86ISD + namespace X86 { + /// Current rounding mode is represented in bits 11:10 of the x87 FP control + /// word (FPCW). Constants defined here are FPCW values for the supported + /// rounding modes with all other bits set to zero. The same values are used + /// by glibc to represent rounding modes (FE_* constants). + enum RoundingMode { + rmToNearest = 0, // FE_TONEAREST + rmDownward = 1 << 10, // FE_DOWNWARD + rmUpward = 2 << 10, // FE_UPWARD + rmTowardZero = 3 << 10, // FE_TOWARDZERO + rmMask = 3 << 10 // Bit mask selecting rounding mode + }; + } // end namespace X86 + /// Define some predicates that are used for node matching. namespace X86 { /// Returns true if Elt is a constant zero or floating point constant +0.0. 
@@ -1521,6 +1538,7 @@ SDValue lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -359,6 +359,7 @@ setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FREM , MVT::f128 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); + setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. @@ -26930,6 +26931,122 @@ return DAG.getMergeValues({RetVal, Chain}, DL); } +/// Lower ISD::SET_ROUNDING. Operand 0 is the chain and operand 1 is the new +/// rounding mode (0 - toward zero, 1 - to nearest, 2 - upward, 3 - downward). +/// The mode is written into bits 11:10 of the x87 control word and, if SSE is +/// available, into bits 14:13 of MXCSR. Returns the updated chain. +SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SDLoc DL(Op); + SDValue Chain = Op.getNode()->getOperand(0); + + // FP control word may be set only from data in memory. So we need to allocate + // stack space to save/load FP control word. + int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false); + SDValue StackSlot = + DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout())); + MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx); + MachineMemOperand *MMO = + MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2)); + + // Store FP control word into memory. 
+ SDValue Ops[] = {Chain, StackSlot}; + Chain = DAG.getMemIntrinsicNode( + X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO); + + // Load FP Control Word from stack slot and clear RM field (bits 11:10). + SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI); + Chain = CWD.getValue(1); + CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0), + DAG.getConstant(0xf3ff, DL, MVT::i16)); + + // Calculate new rounding mode. + SDValue NewRM = Op.getNode()->getOperand(1); + SDValue RMBits; + if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) { + uint64_t RM = CVal->getZExtValue(); + int FieldVal; + switch (static_cast<RoundingMode>(RM)) { + case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break; + case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break; + case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break; + case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break; + default: + report_fatal_error("rounding mode is not supported by X86 hardware"); + } + RMBits = DAG.getConstant(FieldVal, DL, MVT::i16); + } else { + // Need to convert argument into bits of control word: + // 0 Round to 0 -> 11 + // 1 Round to nearest -> 00 + // 2 Round to +inf -> 10 + // 3 Round to -inf -> 01 + // The 2-bit value needs then to be shifted so that it occupies bits 11:10. + // To make the conversion, put all these values into a value 0xc9 and shift + // it left depending on the rounding mode: + // (0xc9 << 4) & 0xc00 = X86::rmTowardZero + // (0xc9 << 6) & 0xc00 = X86::rmToNearest + // ... 
+ // (0xc9 << (2 * NewRM + 4)) & 0xc00 + SDValue ShiftValue = + DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, + DAG.getNode(ISD::ADD, DL, MVT::i32, + DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM, + DAG.getConstant(1, DL, MVT::i8)), + DAG.getConstant(4, DL, MVT::i32))); + SDValue Shifted = + DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16), + ShiftValue); + RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted, + DAG.getConstant(0xc00, DL, MVT::i16)); + } + + // Update rounding mode bits and store the new FP Control Word into stack. + CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits); + Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, /* Alignment = */ 2); + + // Load FP control word from the slot. + SDValue OpsLD[] = {Chain, StackSlot}; + MachineMemOperand *MMOL = + MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2)); + Chain = DAG.getMemIntrinsicNode( + X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL); + + // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the + // same way but in bits 14:13. + if (Subtarget.hasSSE1()) { + // Store MXCSR into memory. + Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain, + DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32), + StackSlot); + + // Load MXCSR from stack slot and clear RM field (bits 14:13). + SDValue MXCSR = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI); + Chain = MXCSR.getValue(1); + MXCSR = DAG.getNode(ISD::AND, DL, MVT::i32, MXCSR.getValue(0), + DAG.getConstant(0xffff9fff, DL, MVT::i32)); + + // Shift X87 RM bits from 11:10 to 14:13. + RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits); + RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits, + DAG.getConstant(3, DL, MVT::i8)); + + // Update rounding mode bits and store the new MXCSR value into stack. 
+ MXCSR = DAG.getNode(ISD::OR, DL, MVT::i32, MXCSR, RMBits); + Chain = DAG.getStore(Chain, DL, MXCSR, StackSlot, MPI, /* Alignment = */ 4); + + // Load MXCSR from the slot. + Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain, + DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32), + StackSlot); + } + + return Chain; +} + /// Lower a vector CTLZ using native supported vector CTLZ instruction. // // i8/i16 vector implemented using dword LZCNT vector instruction @@ -30025,6 +30142,7 @@ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); + case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); case ISD::CTLZ: case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); case ISD::CTTZ: @@ -31045,6 +31163,7 @@ NODE_NAME_CASE(EH_RETURN) NODE_NAME_CASE(TC_RETURN) NODE_NAME_CASE(FNSTCW16m) + NODE_NAME_CASE(FLDCW16m) NODE_NAME_CASE(LCMPXCHG_DAG) NODE_NAME_CASE(LCMPXCHG8_DAG) NODE_NAME_CASE(LCMPXCHG16_DAG) diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td --- a/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/llvm/lib/Target/X86/X86InstrFPStack.td @@ -24,6 +24,7 @@ def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>; def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def SDTX86CwdLoad : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; @@ -38,6 +39,9 @@ def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, [SDNPHasChain, SDNPMayStore, SDNPSideEffect, SDNPMemOperand]>; +def X86fp_cwd_set16 : SDNode<"X86ISD::FLDCW16m", SDTX86CwdLoad, + [SDNPHasChain, SDNPMayLoad, SDNPSideEffect, + SDNPMemOperand]>; def X86fstf32 : PatFrag<(ops node:$val, node:$ptr), (X86fst node:$val, node:$ptr), [{ @@ -705,7 +709,8 @@ } // SchedRW let Defs = [FPSW,FPCW], 
mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] - (outs), (ins i16mem:$dst), "fldcw\t$dst", []>, + (outs), (ins i16mem:$dst), "fldcw\t$dst", + [(X86fp_cwd_set16 addr:$dst)]>, Sched<[WriteLoad]>; // FPU control instructions diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/fpenv.ll @@ -0,0 +1,244 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=-sse -verify-machineinstrs < %s | FileCheck %s -check-prefix=X86-NOSSE +; RUN: llc -mtriple=i686-unknown-linux-gnu -mattr=+sse -verify-machineinstrs < %s | FileCheck %s -check-prefix=X86-SSE +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -verify-machineinstrs < %s | FileCheck %s -check-prefix=X64 + +declare void @llvm.set.rounding(i32 %x) + +define void @func_01() nounwind { +; X86-NOSSE-LABEL: func_01: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %eax +; X86-NOSSE-NEXT: fnstcw (%esp) +; X86-NOSSE-NEXT: orb $12, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: retl +; +; X86-SSE-LABEL: func_01: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: fnstcw (%esp) +; X86-SSE-NEXT: orb $12, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) +; X86-SSE-NEXT: orb $96, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X64-LABEL: func_01: +; X64: # %bb.0: +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: orb $12, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: orb $96, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: retq + call void @llvm.set.rounding(i32 0) ; TowardZero (CW[11-10] = 11) + ret void +} + +define void @func_02() nounwind { +; X86-NOSSE-LABEL: func_02: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl 
%eax +; X86-NOSSE-NEXT: fnstcw (%esp) +; X86-NOSSE-NEXT: andb $-13, {{[0-9]+}}(%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: retl +; +; X86-SSE-LABEL: func_02: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: fnstcw (%esp) +; X86-SSE-NEXT: andb $-13, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) +; X86-SSE-NEXT: andb $-97, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X64-LABEL: func_02: +; X64: # %bb.0: +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: andb $-13, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: andb $-97, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: retq + call void @llvm.set.rounding(i32 1) ; ToNearestTiesToEven (CW[11-10] = 00) + ret void +} + +define void @func_03() nounwind { +; X86-NOSSE-LABEL: func_03: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %eax +; X86-NOSSE-NEXT: fnstcw (%esp) +; X86-NOSSE-NEXT: movl $-3073, %eax # imm = 0xF3FF +; X86-NOSSE-NEXT: andl (%esp), %eax +; X86-NOSSE-NEXT: orl $2048, %eax # imm = 0x800 +; X86-NOSSE-NEXT: movw %ax, (%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: retl +; +; X86-SSE-LABEL: func_03: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: fnstcw (%esp) +; X86-SSE-NEXT: movl $-3073, %eax # imm = 0xF3FF +; X86-SSE-NEXT: andl (%esp), %eax +; X86-SSE-NEXT: orl $2048, %eax # imm = 0x800 +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) +; X86-SSE-NEXT: movl $-24577, %eax # imm = 0x9FFF +; X86-SSE-NEXT: andl (%esp), %eax +; X86-SSE-NEXT: orl $16384, %eax # imm = 0x4000 +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X64-LABEL: func_03: +; X64: # %bb.0: +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: 
movl $-3073, %eax # imm = 0xF3FF +; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $2048, %eax # imm = 0x800 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: movl $-24577, %eax # imm = 0x9FFF +; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $16384, %eax # imm = 0x4000 +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: retq + call void @llvm.set.rounding(i32 2) ; Upward (CW[11-10] = 10) + ret void +} + +define void @func_04() nounwind { +; X86-NOSSE-LABEL: func_04: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %eax +; X86-NOSSE-NEXT: fnstcw (%esp) +; X86-NOSSE-NEXT: movl $-3073, %eax # imm = 0xF3FF +; X86-NOSSE-NEXT: andl (%esp), %eax +; X86-NOSSE-NEXT: orl $1024, %eax # imm = 0x400 +; X86-NOSSE-NEXT: movw %ax, (%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: retl +; +; X86-SSE-LABEL: func_04: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: fnstcw (%esp) +; X86-SSE-NEXT: movl $-3073, %eax # imm = 0xF3FF +; X86-SSE-NEXT: andl (%esp), %eax +; X86-SSE-NEXT: orl $1024, %eax # imm = 0x400 +; X86-SSE-NEXT: movw %ax, (%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) +; X86-SSE-NEXT: movl $-24577, %eax # imm = 0x9FFF +; X86-SSE-NEXT: andl (%esp), %eax +; X86-SSE-NEXT: orl $8192, %eax # imm = 0x2000 +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X64-LABEL: func_04: +; X64: # %bb.0: +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movl $-3073, %eax # imm = 0xF3FF +; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $1024, %eax # imm = 0x400 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: movl $-24577, %eax # imm = 0x9FFF +; X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $8192, %eax # imm = 
0x2000 +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: retq + call void @llvm.set.rounding(i32 3) ; Downward (CW[11-10] = 01) + ret void +} + +define void @func_05(i32 %x) nounwind { +; X86-NOSSE-LABEL: func_05: +; X86-NOSSE: # %bb.0: +; X86-NOSSE-NEXT: pushl %eax +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NOSSE-NEXT: leal 4(%eax,%eax), %ecx +; X86-NOSSE-NEXT: movl $201, %eax +; X86-NOSSE-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NOSSE-NEXT: shll %cl, %eax +; X86-NOSSE-NEXT: andl $3072, %eax # imm = 0xC00 +; X86-NOSSE-NEXT: fnstcw (%esp) +; X86-NOSSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF +; X86-NOSSE-NEXT: andl (%esp), %ecx +; X86-NOSSE-NEXT: orl %eax, %ecx +; X86-NOSSE-NEXT: movw %cx, (%esp) +; X86-NOSSE-NEXT: fldcw (%esp) +; X86-NOSSE-NEXT: popl %eax +; X86-NOSSE-NEXT: retl +; +; X86-SSE-LABEL: func_05: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: leal 4(%eax,%eax), %ecx +; X86-SSE-NEXT: movl $201, %eax +; X86-SSE-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE-NEXT: shll %cl, %eax +; X86-SSE-NEXT: andl $3072, %eax # imm = 0xC00 +; X86-SSE-NEXT: fnstcw (%esp) +; X86-SSE-NEXT: movl $-3073, %ecx # imm = 0xF3FF +; X86-SSE-NEXT: andl (%esp), %ecx +; X86-SSE-NEXT: orl %eax, %ecx +; X86-SSE-NEXT: movw %cx, (%esp) +; X86-SSE-NEXT: fldcw (%esp) +; X86-SSE-NEXT: stmxcsr (%esp) +; X86-SSE-NEXT: movl $-24577, %ecx # imm = 0x9FFF +; X86-SSE-NEXT: andl (%esp), %ecx +; X86-SSE-NEXT: leal (%ecx,%eax,8), %eax +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: ldmxcsr (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: retl +; +; X64-LABEL: func_05: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: leal 4(%rdi,%rdi), %ecx +; X64-NEXT: movl $201, %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: shll %cl, %eax +; X64-NEXT: andl $3072, %eax # imm = 0xC00 +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) 
+; X64-NEXT: movl $-3073, %ecx # imm = 0xF3FF +; X64-NEXT: andl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: movl $-24577, %ecx # imm = 0x9FFF +; X64-NEXT: andl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: leal (%rcx,%rax,8), %eax +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; X64-NEXT: retq + call void @llvm.set.rounding(i32 %x) ; Rounding mode is the runtime argument %x + ret void +}