# Changeset View

Changeset View

# Standalone View

Standalone View

# llvm/lib/Target/X86/X86ISelLowering.cpp

- This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 23,142 Lines • ▼ Show 20 Lines | |||||

/// The minimum architected relative accuracy is 2^-12. We need one | /// The minimum architected relative accuracy is 2^-12. We need one | ||||

/// Newton-Raphson step to have a good float result (24 bits of precision). | /// Newton-Raphson step to have a good float result (24 bits of precision). | ||||

SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, | SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, | ||||

SelectionDAG &DAG, int Enabled, | SelectionDAG &DAG, int Enabled, | ||||

int &RefinementSteps, | int &RefinementSteps, | ||||

bool &UseOneConstNR, | bool &UseOneConstNR, | ||||

bool Reciprocal) const { | bool Reciprocal) const { | ||||

SDLoc DL(Op); | |||||

EVT VT = Op.getValueType(); | EVT VT = Op.getValueType(); | ||||

// SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. | // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. | ||||

// It is likely not profitable to do this for f64 because a double-precision | // It is likely not profitable to do this for f64 because a double-precision | ||||

// rsqrt estimate with refinement on x86 prior to FMA requires at least 16 | // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 | ||||

// instructions: convert to single, rsqrtss, convert back to double, refine | // instructions: convert to single, rsqrtss, convert back to double, refine | ||||

// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA | // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA | ||||

// along with FMA, this could be a throughput win. | // along with FMA, this could be a throughput win. | ||||

// TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 | // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 | ||||

// after legalize types. | // after legalize types. | ||||

if ((VT == MVT::f32 && Subtarget.hasSSE1()) || | if ((VT == MVT::f32 && Subtarget.hasSSE1()) || | ||||

(VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || | (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || | ||||

(VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || | (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || | ||||

(VT == MVT::v8f32 && Subtarget.hasAVX()) || | (VT == MVT::v8f32 && Subtarget.hasAVX()) || | ||||

(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { | (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { | ||||

if (RefinementSteps == ReciprocalEstimate::Unspecified) | if (RefinementSteps == ReciprocalEstimate::Unspecified) | ||||

RefinementSteps = 1; | RefinementSteps = 1; | ||||

UseOneConstNR = false; | UseOneConstNR = false; | ||||

// There is no FRSQRT for 512-bits, but there is RSQRT14. | // There is no FRSQRT for 512-bits, but there is RSQRT14. | ||||

unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; | unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; | ||||

return DAG.getNode(Opcode, SDLoc(Op), VT, Op); | return DAG.getNode(Opcode, DL, VT, Op); | ||||

} | |||||

if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) && | |||||

Subtarget.hasFP16()) { | |||||

craig.topper: Should we keep the user RefinementStep if it isn't ReciprocalEstimate::Unspecified? | |||||

pengfei: Yes, we should respect the user's RefinementSteps here. Thanks Craig. | |||||

if (RefinementSteps == ReciprocalEstimate::Unspecified) | |||||

RefinementSteps = 0; | |||||

if (VT == MVT::f16) { | |||||

SDValue Zero = DAG.getIntPtrConstant(0, DL); | |||||

RKSimon: This is a SCALAR_TO_VECTOR node. | |||||

SDValue Undef = DAG.getUNDEF(MVT::v8f16); | |||||

Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op); | |||||

Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op); | |||||

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero); | |||||

} | |||||

return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op); | |||||

} | } | ||||

return SDValue(); | return SDValue(); | ||||

} | } | ||||

/// The minimum architected relative accuracy is 2^-12. We need one | /// The minimum architected relative accuracy is 2^-12. We need one | ||||

/// Newton-Raphson step to have a good float result (24 bits of precision). | /// Newton-Raphson step to have a good float result (24 bits of precision). | ||||

SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, | SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, | ||||

int Enabled, | int Enabled, | ||||

int &RefinementSteps) const { | int &RefinementSteps) const { | ||||

SDLoc DL(Op); | |||||

EVT VT = Op.getValueType(); | EVT VT = Op.getValueType(); | ||||

// SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. | // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. | ||||

// It is likely not profitable to do this for f64 because a double-precision | // It is likely not profitable to do this for f64 because a double-precision | ||||

// reciprocal estimate with refinement on x86 prior to FMA requires | // reciprocal estimate with refinement on x86 prior to FMA requires | ||||

// 15 instructions: convert to single, rcpss, convert back to double, refine | // 15 instructions: convert to single, rcpss, convert back to double, refine | ||||

// (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA | // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA | ||||

// along with FMA, this could be a throughput win. | // along with FMA, this could be a throughput win. | ||||

if ((VT == MVT::f32 && Subtarget.hasSSE1()) || | if ((VT == MVT::f32 && Subtarget.hasSSE1()) || | ||||

(VT == MVT::v4f32 && Subtarget.hasSSE1()) || | (VT == MVT::v4f32 && Subtarget.hasSSE1()) || | ||||

(VT == MVT::v8f32 && Subtarget.hasAVX()) || | (VT == MVT::v8f32 && Subtarget.hasAVX()) || | ||||

(VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { | (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { | ||||

// Enable estimate codegen with 1 refinement step for vector division. | // Enable estimate codegen with 1 refinement step for vector division. | ||||

// Scalar division estimates are disabled because they break too much | // Scalar division estimates are disabled because they break too much | ||||

// real-world code. These defaults are intended to match GCC behavior. | // real-world code. These defaults are intended to match GCC behavior. | ||||

if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) | if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) | ||||

return SDValue(); | return SDValue(); | ||||

if (RefinementSteps == ReciprocalEstimate::Unspecified) | if (RefinementSteps == ReciprocalEstimate::Unspecified) | ||||

RefinementSteps = 1; | RefinementSteps = 1; | ||||

// There is no FRCP for 512-bits, but there is RCP14. | // There is no FRCP for 512-bits, but there is RCP14. | ||||

unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP; | unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP; | ||||

return DAG.getNode(Opcode, SDLoc(Op), VT, Op); | return DAG.getNode(Opcode, DL, VT, Op); | ||||

} | |||||

if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) && | |||||

craig.topper (Not Done): You need to know the type is legal. v64f16 will get through this, but type legalization won't be able to split a v64f16 X86ISD::RCP14 node. | |||||

pengfei: Yes. Thanks Craig! | |||||

Subtarget.hasFP16()) { | |||||

craig.topper: Same question as above. | |||||

if (RefinementSteps == ReciprocalEstimate::Unspecified) | |||||

RefinementSteps = 0; | |||||

if (VT == MVT::f16) { | |||||

SDValue Zero = DAG.getIntPtrConstant(0, DL); | |||||

RKSimon: SCALAR_TO_VECTOR | |||||

SDValue Undef = DAG.getUNDEF(MVT::v8f16); | |||||

Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op); | |||||

Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op); | |||||

return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero); | |||||

} | |||||

return DAG.getNode(X86ISD::RCP14, DL, VT, Op); | |||||

} | } | ||||

return SDValue(); | return SDValue(); | ||||

} | } | ||||

/// If we have at least two divisions that use the same divisor, convert to | /// If we have at least two divisions that use the same divisor, convert to | ||||

/// multiplication by a reciprocal. This may need to be adjusted for a given | /// multiplication by a reciprocal. This may need to be adjusted for a given | ||||

/// CPU if a division's cost is not at least twice the cost of a multiplication. | /// CPU if a division's cost is not at least twice the cost of a multiplication. | ||||

/// This is because we still need one division to calculate the reciprocal and | /// This is because we still need one division to calculate the reciprocal and | ||||

▲ Show 20 Lines • Show All 30,809 Lines • Show Last 20 Lines |

Should we keep the user RefinementStep if it isn't ReciprocalEstimate::Unspecified?