diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1342,6 +1342,12 @@
     SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+                          RTLIB::Libcall Call) const;

     SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                  bool isVarArg,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -331,6 +331,7 @@
   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f128 , Expand);
   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);

   // Promote the i8 variants and force them on up to i32 which has a shorter
@@ -377,15 +378,19 @@
   // There's never any support for operations beyond MVT::f32.
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
+  setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);

   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f16, Expand);
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

   if (Subtarget.hasPOPCNT()) {
     setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
@@ -619,19 +624,8 @@
   setOperationAction(ISD::FMA, MVT::f64, Expand);
   setOperationAction(ISD::FMA, MVT::f32, Expand);

-  // Long double always uses X87, except f128 in SSE.
+  // f80 always uses X87.
   if (UseX87) {
-    if (Subtarget.is64Bit() && Subtarget.hasSSE1()) {
-      addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
-                                                      : &X86::VR128RegClass);
-      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
-      setOperationAction(ISD::FABS , MVT::f128, Custom);
-      setOperationAction(ISD::FNEG , MVT::f128, Custom);
-      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
-
-      addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
-    }
-
     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
     setOperationAction(ISD::UNDEF,     MVT::f80,   Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f80,   Expand);
@@ -667,10 +661,60 @@
     setOperationAction(ISD::LLRINT, MVT::f80, Expand);
   }

+  // f128 uses xmm registers, but most operations require libcalls.
+  if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
+    addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
+                                                    : &X86::VR128RegClass);
+
+    addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
+
+    setOperationAction(ISD::FADD, MVT::f128, Custom);
+    setOperationAction(ISD::FSUB, MVT::f128, Custom);
+    setOperationAction(ISD::FDIV, MVT::f128, Custom);
+    setOperationAction(ISD::FMUL, MVT::f128, Custom);
+    setOperationAction(ISD::FMA, MVT::f128, Expand);
+
+    setOperationAction(ISD::FABS, MVT::f128, Custom);
+    setOperationAction(ISD::FNEG, MVT::f128, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+
+    setOperationAction(ISD::FSIN, MVT::f128, Expand);
+    setOperationAction(ISD::FCOS, MVT::f128, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+    setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+    // We need to custom handle any FP_ROUND with an f128 input, but
+    // LegalizeDAG uses the result type to know when to run a custom handler.
+    // So we have to list all legal floating point result types here.
+    if (isTypeLegal(MVT::f32)) {
+      setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
+    }
+    if (isTypeLegal(MVT::f64)) {
+      setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
+    }
+    if (isTypeLegal(MVT::f80)) {
+      setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
+      setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
+    }
+
+    setOperationAction(ISD::SETCC, MVT::f128, Custom);
+
+    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
+    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
+    setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
+    setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+    setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+    setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+  }
+
   // Always use a library call for pow.
   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f128 , Expand);

   setOperationAction(ISD::FLOG, MVT::f80, Expand);
   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
@@ -780,6 +824,8 @@

     setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
     setOperationAction(ISD::STORE, MVT::v2f32, Custom);
+
+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom);
   }

   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -1133,6 +1179,8 @@
     setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);

+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom);
+
     if (!Subtarget.hasAVX512())
       setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
@@ -1394,6 +1442,8 @@
     setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

+    setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom);
+
     setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
     setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
     setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
@@ -4655,6 +4705,10 @@
       // X < 0   -> X == 0, jump on sign.
       return X86::COND_S;
     }
+    if (SetCCOpcode == ISD::SETGE && RHSC->isNullValue()) {
+      // X >= 0   -> X == 0, jump on !sign.
+      return X86::COND_NS;
+    }
     if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) {
       // X < 1   -> X <= 0
       RHS = DAG.getConstant(0, DL, RHS.getValueType());
@@ -18269,6 +18323,9 @@
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);

+  if (VT == MVT::f128)
+    return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
+
   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
     return Extract;

@@ -18628,16 +18685,18 @@
   SDValue N0 = Op.getOperand(0);
   SDLoc dl(Op);
   auto PtrVT = getPointerTy(DAG.getDataLayout());
+  MVT SrcVT = N0.getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
+
+  if (DstVT == MVT::f128)
+    return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));

-  if (Op.getSimpleValueType().isVector())
+  if (SrcVT.isVector())
     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);

   if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
     return Extract;

-  MVT SrcVT = N0.getSimpleValueType();
-  MVT DstVT = Op.getSimpleValueType();
-
   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
     // Conversions from unsigned i32 to f32/f64 are legal,
@@ -19365,6 +19424,17 @@
   MVT SrcVT = Src.getSimpleValueType();
   SDLoc dl(Op);

+  if (SrcVT == MVT::f128) {
+    RTLIB::Libcall LC;
+    if (Op.getOpcode() == ISD::FP_TO_SINT)
+      LC = RTLIB::getFPTOSINT(SrcVT, VT);
+    else
+      LC = RTLIB::getFPTOUINT(SrcVT, VT);
+
+    MakeLibCallOptions CallOptions;
+    return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first;
+  }
+
   if (VT.isVector()) {
     if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
       MVT ResVT = MVT::v4i32;
@@ -19440,12 +19510,17 @@
   llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
 }

-static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   MVT VT = Op.getSimpleValueType();
   SDValue In = Op.getOperand(0);
   MVT SVT = In.getSimpleValueType();

+  if (VT == MVT::f128) {
+    RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
+    return LowerF128Call(Op, DAG, LC);
+  }
+
   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
@@ -19453,6 +19528,33 @@
                      In, DAG.getUNDEF(SVT)));
 }

+SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDValue In = Op.getOperand(0);
+  MVT SVT = In.getSimpleValueType();
+
+  // It's legal except when f128 is involved.
+  if (SVT != MVT::f128)
+    return Op;
+
+  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
+
+  // FP_ROUND node has a second operand indicating whether it is known to be
+  // precise. That doesn't take part in the LibCall so we can't directly use
+  // LowerF128Call.
+  MakeLibCallOptions CallOptions;
+  return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first;
+}
+
+// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking
+// the default expansion of STRICT_FP_ROUND.
+static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
+  // FIXME: Need to form a libcall with an input chain for f128.
+  assert(Op.getOperand(0).getValueType() != MVT::f128 &&
+         "Don't know how to handle f128 yet!");
+  return Op;
+}
+
 /// Horizontal vector math instructions may be slower than normal math with
 /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
 /// implementation, and likely shuffle complexity of the alternate sequence.
@@ -19537,8 +19639,13 @@

 /// Depending on uarch and/or optimizing for size, we might prefer to use a
 /// vector operation in place of the typical scalar operation.
-static SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG,
-                             const X86Subtarget &Subtarget) {
+SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
+  if (Op.getValueType() == MVT::f128) {
+    RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? RTLIB::ADD_F128
+                                                    : RTLIB::SUB_F128;
+    return LowerF128Call(Op, DAG, LC);
+  }
+
   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
          "Only expecting float/double");
   return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
@@ -20868,6 +20975,19 @@
   SDLoc dl(Op);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

+  // Handle f128 first, since one possible outcome is a normal integer
+  // comparison which gets handled by emitFlagsForSetcc.
+  if (Op0.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1);
+
+    // If softenSetCCOperands returned a scalar, use it.
+    if (!Op1.getNode()) {
+      assert(Op0.getValueType() == Op.getValueType() &&
+             "Unexpected setcc expansion!");
+      return Op0;
+    }
+  }
+
   SDValue X86CC;
   SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
   if (!EFLAGS)
@@ -27553,6 +27673,13 @@
   return NOOP;
 }

+SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
+                                         RTLIB::Libcall Call) const {
+  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_end());
+  MakeLibCallOptions CallOptions;
+  return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
+}
+
 /// Provide custom lowering hooks for some operations.
 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -27599,10 +27726,14 @@
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
+  case ISD::FP_ROUND:           return LowerFP_ROUND(Op, DAG);
+  case ISD::STRICT_FP_ROUND:    return LowerSTRICT_FP_ROUND(Op, DAG);
   case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
   case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
   case ISD::FADD:
-  case ISD::FSUB:               return lowerFaddFsub(Op, DAG, Subtarget);
+  case ISD::FSUB:               return lowerFaddFsub(Op, DAG);
+  case ISD::FMUL:               return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+  case ISD::FDIV:               return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
   case ISD::FABS:
   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -505,15 +505,11 @@
 define void @TestFPTruncF128_F80() nounwind {
 ; X64-SSE-LABEL: TestFPTruncF128_F80:
 ; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    subq $24, %rsp
+; X64-SSE-NEXT:    pushq %rax
 ; X64-SSE-NEXT:    movaps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT:    callq __trunctfxf2
-; X64-SSE-NEXT:    fstpt (%rsp)
-; X64-SSE-NEXT:    movq (%rsp), %rax
-; X64-SSE-NEXT:    movq %rax, {{.*}}(%rip)
-; X64-SSE-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT:    movw %ax, vf80+{{.*}}(%rip)
-; X64-SSE-NEXT:    addq $24, %rsp
+; X64-SSE-NEXT:    fstpt {{.*}}(%rip)
+; X64-SSE-NEXT:    popq %rax
 ; X64-SSE-NEXT:    retq
 ;
 ; X32-LABEL: TestFPTruncF128_F80:
@@ -531,15 +527,11 @@
 ;
 ; X64-AVX-LABEL: TestFPTruncF128_F80:
 ; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    subq $24, %rsp
+; X64-AVX-NEXT:    pushq %rax
 ; X64-AVX-NEXT:    vmovaps {{.*}}(%rip), %xmm0
 ; X64-AVX-NEXT:    callq __trunctfxf2
-; X64-AVX-NEXT:    fstpt (%rsp)
-; X64-AVX-NEXT:    movq (%rsp), %rax
-; X64-AVX-NEXT:    movq %rax, {{.*}}(%rip)
-; X64-AVX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; X64-AVX-NEXT:    movw %ax, vf80+{{.*}}(%rip)
-; X64-AVX-NEXT:    addq $24, %rsp
+; X64-AVX-NEXT:    fstpt {{.*}}(%rip)
+; X64-AVX-NEXT:    popq %rax
 ; X64-AVX-NEXT:    retq
 entry:
   %0 = load fp128, fp128* @vf128, align 16
diff --git a/llvm/test/CodeGen/X86/fp128-compare.ll b/llvm/test/CodeGen/X86/fp128-compare.ll
--- a/llvm/test/CodeGen/X86/fp128-compare.ll
+++ b/llvm/test/CodeGen/X86/fp128-compare.ll
@@ -48,7 +48,10 @@
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    callq __lttf2
-; CHECK-NEXT:    shrl $31, %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    sets %cl
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    retq
@@ -56,9 +59,9 @@
   %cmp = fcmp olt fp128 %d1, %d2
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; The 'shrl' is a special optimization in llvm to combine
-; the effect of 'fcmp olt' and 'zext'. The main purpose is
-; to test soften call to __lttf2.
+; FIXME: This used to generate a shrl to move the sign bit of eax into bit 0.
+; This no longer happens with fp128 compares being expanded by LegalizeDAG.
+; We can add a new DAG combine for X86ISD::CMP/SETCC to restore this.
 }

 define i32 @TestComp128LE(fp128 %d1, fp128 %d2) {
diff --git a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll
--- a/llvm/test/CodeGen/X86/fp128-i128.ll
+++ b/llvm/test/CodeGen/X86/fp128-i128.ll
@@ -160,11 +160,14 @@
 ; AVX-NEXT:    vmovaps (%rsp), %xmm0
 ; AVX-NEXT:    vmovaps {{.*}}(%rip), %xmm1
 ; AVX-NEXT:    callq __lttf2
-; AVX-NEXT:    xorl %ecx, %ecx
 ; AVX-NEXT:    testl %eax, %eax
-; AVX-NEXT:    sets %cl
-; AVX-NEXT:    shlq $4, %rcx
-; AVX-NEXT:    vmovaps {{\.LCPI.*}}(%rcx), %xmm0
+; AVX-NEXT:    js .LBB2_1
+; AVX-NEXT:  # %bb.2: # %entry
+; AVX-NEXT:    vmovaps {{.*}}(%rip), %xmm0
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    retq
+; AVX-NEXT:  .LBB2_1:
+; AVX-NEXT:    vmovaps {{.*}}(%rip), %xmm0
 ; AVX-NEXT:    addq $40, %rsp
 ; AVX-NEXT:    retq
 entry:
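
As an illustrative aside, not part of the patch itself: with ISD::FMUL now marked Custom for MVT::f128 and routed through LowerF128Call(Op, DAG, RTLIB::MUL_F128), a scalar fp128 multiply like the sketch below is expected to lower to a call to __multf3 on x86-64 targets with SSE, in the same way the tests above exercise __lttf2 and __trunctfxf2. The function name is hypothetical and is not one of the test cases touched here.

; Hypothetical example in the style of the llvm/test/CodeGen/X86/fp128-*.ll tests above.
define fp128 @mul_f128(fp128 %a, fp128 %b) nounwind {
entry:
  %prod = fmul fp128 %a, %b   ; expected to compile to a call to __multf3
  ret fp128 %prod
}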