diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -740,6 +740,9 @@ // User level interrupts - testui TESTUI, + // Perform an FP80 add after changing precision control in FPCW. + FP80_ADD, + /// X86 strict FP compare instructions. STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPS, @@ -779,6 +782,9 @@ STRICT_CVTPS2PH, STRICT_CVTPH2PS, + // Perform an FP80 add after changing precision control in FPCW. + STRICT_FP80_ADD, + // WARNING: Only add nodes here if they are strict FP nodes. Non-memory and // non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE. @@ -886,7 +892,8 @@ AESDECWIDE256KL, /// Compare and Add if Condition is Met. Compare value in operand 2 with - /// value in memory of operand 1. If condition of operand 4 is met, add value + /// value in memory of operand 1. If condition of operand 4 is met, add + /// value /// operand 3 to m32 and write new value in operand 1. Operand 2 is /// always updated with the original value from operand 1. CMPCCXADD, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -22000,15 +22000,20 @@ // Extend everything to 80 bits to force it to be done on x87. // TODO: Are there any fast-math-flags to propagate here? if (IsStrict) { - SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other}, - {Chain, Fild, Fudge}); + unsigned Opc = Subtarget.isOSWindows() && DstVT == MVT::f32 + ? X86ISD::STRICT_FP80_ADD + : ISD::STRICT_FADD; + SDValue Add = + DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge}); // STRICT_FP_ROUND can't handle equal types. 
if (DstVT == MVT::f80) return Add; return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other}, {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)}); } - SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); + unsigned Opc = Subtarget.isOSWindows() && DstVT == MVT::f32 ? X86ISD::FP80_ADD + : ISD::FADD; + SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); } @@ -34803,6 +34808,8 @@ NODE_NAME_CASE(AESDECWIDE256KL) NODE_NAME_CASE(CMPCCXADD) NODE_NAME_CASE(TESTUI) + NODE_NAME_CASE(FP80_ADD) + NODE_NAME_CASE(STRICT_FP80_ADD) } return nullptr; #undef NODE_NAME_CASE @@ -37313,6 +37320,69 @@ return BB; } + case X86::FP80_ADDr: + case X86::FP80_ADDm32: { + // Change the floating point control register to use double extended + // precision when performing the addition. + int OrigCWFrameIdx = + MF->getFrameInfo().CreateStackObject(2, Align(2), false); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), + OrigCWFrameIdx); + + // Load the old value of the control word... + Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), + OrigCWFrameIdx); + + // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended + // precision. + Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); + BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) + .addReg(OldCW, RegState::Kill) + .addImm(0x300); + + // Extract to 16 bits. + Register NewCW16 = + MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) + .addReg(NewCW, RegState::Kill, X86::sub_16bit); + + // Prepare memory for FLDCW. 
+ int NewCWFrameIdx = + MF->getFrameInfo().CreateStackObject(2, Align(2), false); + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), + NewCWFrameIdx) + .addReg(NewCW16, RegState::Kill); + + // Reload the modified control word now... + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), + NewCWFrameIdx); + + // Do the addition. + if (MI.getOpcode() == X86::FP80_ADDr) { + BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)); + } else { + BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32)) + .add(MI.getOperand(0)) + .add(MI.getOperand(1)) + .add(MI.getOperand(2)) + .add(MI.getOperand(3)) + .add(MI.getOperand(4)) + .add(MI.getOperand(5)) + .add(MI.getOperand(6)); + } + + // Reload the original control word now. + addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), + OrigCWFrameIdx); + + MI.eraseFromParent(); // The pseudo instruction is gone now. + return BB; + } + case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td --- a/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/llvm/lib/Target/X86/X86InstrFPStack.td @@ -26,6 +26,13 @@ def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDTX86CwdLoad : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; +def X86fp80_add : SDNode<"X86ISD::FP80_ADD", SDTFPBinOp, [SDNPCommutative]>; +def X86strict_fp80_add : SDNode<"X86ISD::STRICT_FP80_ADD", SDTFPBinOp, + [SDNPHasChain,SDNPCommutative]>; +def any_X86fp80_add : PatFrags<(ops node:$lhs, node:$rhs), + [(X86strict_fp80_add node:$lhs, node:$rhs), + (X86fp80_add node:$lhs, node:$rhs)]>; + def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def X86fst : SDNode<"X86ISD::FST", SDTX86Fst, @@ -141,6 +148,14 @@ [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>; def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, 
RFP80:$src), [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>; + + def FP80_ADDr : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), + [(set RFP80:$dst, + (any_X86fp80_add RFP80:$src1, RFP80:$src2))]>; + def FP80_ADDm32 : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), + [(set RFP80:$dst, + (any_X86fp80_add RFP80:$src1, + (f80 (extloadf32 addr:$src2))))]>; } // All FP Stack operations are represented with four instructions here. The diff --git a/llvm/test/CodeGen/X86/uint64-to-float.ll b/llvm/test/CodeGen/X86/uint64-to-float.ll --- a/llvm/test/CodeGen/X86/uint64-to-float.ll +++ b/llvm/test/CodeGen/X86/uint64-to-float.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86-WIN +; RUN: llc < %s -mtriple=x86_64-windows -mattr=+sse2 | FileCheck %s --check-prefix=X64-WIN ; Verify that we are using the efficient uitofp --> sitofp lowering illustrated ; by the compiler_rt implementation of __floatundisf. 
@@ -42,6 +44,48 @@ ; X64-NEXT: cvtsi2ss %rdi, %xmm0 ; X64-NEXT: addss %xmm0, %xmm0 ; X64-NEXT: retq +; +; X86-WIN-LABEL: test: +; X86-WIN: # %bb.0: # %entry +; X86-WIN-NEXT: pushl %ebp +; X86-WIN-NEXT: movl %esp, %ebp +; X86-WIN-NEXT: andl $-8, %esp +; X86-WIN-NEXT: subl $24, %esp +; X86-WIN-NEXT: movl 12(%ebp), %eax +; X86-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-WIN-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; X86-WIN-NEXT: shrl $31, %eax +; X86-WIN-NEXT: fildll {{[0-9]+}}(%esp) +; X86-WIN-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-WIN-NEXT: orl $768, %ecx # imm = 0x300 +; X86-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-WIN-NEXT: fadds __real@5f80000000000000(,%eax,4) +; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-WIN-NEXT: fstps {{[0-9]+}}(%esp) +; X86-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-WIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X86-WIN-NEXT: flds {{[0-9]+}}(%esp) +; X86-WIN-NEXT: movl %ebp, %esp +; X86-WIN-NEXT: popl %ebp +; X86-WIN-NEXT: retl +; +; X64-WIN-LABEL: test: +; X64-WIN: # %bb.0: # %entry +; X64-WIN-NEXT: testq %rcx, %rcx +; X64-WIN-NEXT: js .LBB0_1 +; X64-WIN-NEXT: # %bb.2: # %entry +; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0 +; X64-WIN-NEXT: retq +; X64-WIN-NEXT: .LBB0_1: +; X64-WIN-NEXT: movq %rcx, %rax +; X64-WIN-NEXT: shrq %rax +; X64-WIN-NEXT: andl $1, %ecx +; X64-WIN-NEXT: orq %rax, %rcx +; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0 +; X64-WIN-NEXT: addss %xmm0, %xmm0 +; X64-WIN-NEXT: retq entry: %b = uitofp i64 %a to float ret float %b