Index: llvm/trunk/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/TargetLowering.h +++ llvm/trunk/include/llvm/CodeGen/TargetLowering.h @@ -3365,6 +3365,18 @@ llvm_unreachable("Not Implemented"); } + /// Return 1 if we can compute the negated form of the specified expression + /// for the same cost as the expression itself, or 2 if we can compute the + /// negated form more cheaply than the expression itself. Else return 0. + virtual char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth = 0) const; + + /// If isNegatibleForFree returns true, return the newly negated expression. + virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth = 0) const; + //===--------------------------------------------------------------------===// // Lowering methods - These methods must be implemented by targets so that // the SelectionDAGBuilder code knows how to lower these. Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -785,252 +785,6 @@ DAG.DeleteNode(N); } -/// Return 1 if we can compute the negated form of the specified expression for -/// the same cost as the expression itself, or 2 if we can compute the negated -/// form more cheaply than the expression itself. -static char isNegatibleForFree(SDValue Op, bool LegalOperations, - const TargetLowering &TLI, - const TargetOptions *Options, - bool ForCodeSize, - unsigned Depth = 0) { - // fneg is removable even if it has multiple uses. - if (Op.getOpcode() == ISD::FNEG) - return 2; - - // Don't allow anything with multiple uses unless we know it is free. - EVT VT = Op.getValueType(); - const SDNodeFlags Flags = Op->getFlags(); - if (!Op.hasOneUse() && - !(Op.getOpcode() == ISD::FP_EXTEND && - TLI.isFPExtFree(VT, Op.getOperand(0).getValueType()))) - return 0; - - // Don't recurse exponentially. - if (Depth > SelectionDAG::MaxRecursionDepth) - return 0; - - switch (Op.getOpcode()) { - default: return false; - case ISD::ConstantFP: { - if (!LegalOperations) - return 1; - - // Don't invert constant FP values after legalization unless the target says - // the negated constant is legal. - return TLI.isOperationLegal(ISD::ConstantFP, VT) || - TLI.isFPImmLegal(neg(cast(Op)->getValueAPF()), VT, - ForCodeSize); - } - case ISD::BUILD_VECTOR: { - // Only permit BUILD_VECTOR of constants. - if (llvm::any_of(Op->op_values(), [&](SDValue N) { - return !N.isUndef() && !isa(N); - })) - return 0; - if (!LegalOperations) - return 1; - if (TLI.isOperationLegal(ISD::ConstantFP, VT) && - TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) - return 1; - return llvm::all_of(Op->op_values(), [&](SDValue N) { - return N.isUndef() || - TLI.isFPImmLegal(neg(cast(N)->getValueAPF()), VT, - ForCodeSize); - }); - } - case ISD::FADD: - if (!Options->NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) - return 0; - - // After operation legalization, it might not be legal to create new FSUBs. - if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) - return 0; - - // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) - if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, - Options, ForCodeSize, Depth + 1)) - return V; - // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) - return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, - ForCodeSize, Depth + 1); - case ISD::FSUB: - // We can't turn -(A-B) into B-A when we honor signed zeros. - if (!Options->NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) - return 0; - - // fold (fneg (fsub A, B)) -> (fsub B, A) - return 1; - - case ISD::FMUL: - case ISD::FDIV: - // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) - if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, - Options, ForCodeSize, Depth + 1)) - return V; - - // Ignore X * 2.0 because that is expected to be canonicalized to X + X. - if (auto *C = isConstOrConstSplatFP(Op.getOperand(1))) - if (C->isExactlyValue(2.0) && Op.getOpcode() == ISD::FMUL) - return 0; - - return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, - ForCodeSize, Depth + 1); - - case ISD::FMA: - case ISD::FMAD: { - if (!Options->NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) - return 0; - - // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z)) - // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z)) - char V2 = isNegatibleForFree(Op.getOperand(2), LegalOperations, TLI, - Options, ForCodeSize, Depth + 1); - if (!V2) - return 0; - - // One of Op0/Op1 must be cheaply negatible, then select the cheapest. - char V0 = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, - Options, ForCodeSize, Depth + 1); - char V1 = isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, - Options, ForCodeSize, Depth + 1); - char V01 = std::max(V0, V1); - return V01 ? std::max(V01, V2) : 0; - } - - case ISD::FP_EXTEND: - case ISD::FP_ROUND: - case ISD::FSIN: - return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, - ForCodeSize, Depth + 1); - } -} - -/// If isNegatibleForFree returns true, return the newly negated expression. -static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, - bool LegalOperations, bool ForCodeSize, - unsigned Depth = 0) { - // fneg is removable even if it has multiple uses. - if (Op.getOpcode() == ISD::FNEG) - return Op.getOperand(0); - - assert(Depth <= SelectionDAG::MaxRecursionDepth && - "GetNegatedExpression doesn't match isNegatibleForFree"); - const TargetOptions &Options = DAG.getTarget().Options; - const SDNodeFlags Flags = Op->getFlags(); - - switch (Op.getOpcode()) { - default: llvm_unreachable("Unknown code"); - case ISD::ConstantFP: { - APFloat V = cast(Op)->getValueAPF(); - V.changeSign(); - return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); - } - case ISD::BUILD_VECTOR: { - SmallVector Ops; - for (SDValue C : Op->op_values()) { - if (C.isUndef()) { - Ops.push_back(C); - continue; - } - APFloat V = cast(C)->getValueAPF(); - V.changeSign(); - Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType())); - } - return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops); - } - case ISD::FADD: - assert((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && - "Expected NSZ fp-flag"); - - // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) - if (isNegatibleForFree(Op.getOperand(0), LegalOperations, - DAG.getTargetLoweringInfo(), &Options, ForCodeSize, - Depth + 1)) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1), Flags); - // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(1), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(0), Flags); - case ISD::FSUB: - // fold (fneg (fsub 0, B)) -> B - if (ConstantFPSDNode *N0CFP = - isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true)) - if (N0CFP->isZero()) - return Op.getOperand(1); - - // fold (fneg (fsub A, B)) -> (fsub B, A) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - Op.getOperand(1), Op.getOperand(0), Flags); - - case ISD::FMUL: - case ISD::FDIV: - // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) - if (isNegatibleForFree(Op.getOperand(0), LegalOperations, - DAG.getTargetLoweringInfo(), &Options, ForCodeSize, - Depth + 1)) - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1), Flags); - - // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - Op.getOperand(0), - GetNegatedExpression(Op.getOperand(1), DAG, - LegalOperations, ForCodeSize, - Depth + 1), Flags); - - case ISD::FMA: - case ISD::FMAD: { - assert((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && - "Expected NSZ fp-flag"); - - SDValue Neg2 = GetNegatedExpression(Op.getOperand(2), DAG, LegalOperations, - ForCodeSize, Depth + 1); - - char V0 = isNegatibleForFree(Op.getOperand(0), LegalOperations, - DAG.getTargetLoweringInfo(), &Options, - ForCodeSize, Depth + 1); - char V1 = isNegatibleForFree(Op.getOperand(1), LegalOperations, - DAG.getTargetLoweringInfo(), &Options, - ForCodeSize, Depth + 1); - if (V0 >= V1) { - // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z)) - SDValue Neg0 = GetNegatedExpression( - Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Depth + 1); - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Neg0, - Op.getOperand(1), Neg2, Flags); - } - - // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z)) - SDValue Neg1 = GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, - ForCodeSize, Depth + 1); - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - Op.getOperand(0), Neg1, Neg2, Flags); - } - - case ISD::FP_EXTEND: - case ISD::FSIN: - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1)); - case ISD::FP_ROUND: - return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), - GetNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1)); - } -} - // APInts must be the same size for most operations, this helper // function zero extends the shorter of the pair so that they match. // We provide an Offset so that we can create bitwidths that won't overflow. @@ -12052,17 +11806,17 @@ // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && - isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize) == 2) - return DAG.getNode(ISD::FSUB, DL, VT, N0, - GetNegatedExpression(N1, DAG, LegalOperations, - ForCodeSize), Flags); + TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize) == 2) + return DAG.getNode( + ISD::FSUB, DL, VT, N0, + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); // fold (fadd (fneg A), B) -> (fsub B, A) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && - isNegatibleForFree(N0, LegalOperations, TLI, &Options, ForCodeSize) == 2) - return DAG.getNode(ISD::FSUB, DL, VT, N1, - GetNegatedExpression(N0, DAG, LegalOperations, - ForCodeSize), Flags); + TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize) == 2) + return DAG.getNode( + ISD::FSUB, DL, VT, N1, + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), Flags); auto isFMulNegTwo = [](SDValue FMul) { if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL) @@ -12241,16 +11995,16 @@ if (N0CFP && N0CFP->isZero()) { if (N0CFP->isNegative() || (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) { - if (isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize)) - return GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize)) + return TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags); } } if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) || - (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) - && N1.getOpcode() == ISD::FADD) { + (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && + N1.getOpcode() == ISD::FADD) { // X - (X + Y) -> -Y if (N0 == N1->getOperand(0)) return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1), Flags); @@ -12260,10 +12014,10 @@ } // fold (fsub A, (fneg B)) -> (fadd A, B) - if (isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize)) - return DAG.getNode(ISD::FADD, DL, VT, N0, - GetNegatedExpression(N1, DAG, LegalOperations, - ForCodeSize), Flags); + if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize)) + return DAG.getNode( + ISD::FADD, DL, VT, N0, + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); // FSUB -> FMA combines: if (SDValue Fused = visitFSUBForFMACombine(N)) { @@ -12277,11 +12031,10 @@ /// Return true if both inputs are at least as cheap in negated form and at /// least one input is strictly cheaper in negated form. bool DAGCombiner::isCheaperToUseNegatedFPOps(SDValue X, SDValue Y) { - const TargetOptions &Options = DAG.getTarget().Options; - if (char LHSNeg = isNegatibleForFree(X, LegalOperations, TLI, &Options, - ForCodeSize)) - if (char RHSNeg = isNegatibleForFree(Y, LegalOperations, TLI, &Options, - ForCodeSize)) + if (char LHSNeg = + TLI.isNegatibleForFree(X, DAG, LegalOperations, ForCodeSize)) + if (char RHSNeg = + TLI.isNegatibleForFree(Y, DAG, LegalOperations, ForCodeSize)) // Both negated operands are at least as cheap as their counterparts. // Check to see if at least one is cheaper negated. if (LHSNeg == 2 || RHSNeg == 2) @@ -12362,8 +12115,10 @@ // -N0 * -N1 --> N0 * N1 if (isCheaperToUseNegatedFPOps(N0, N1)) { - SDValue NegN0 = GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); - SDValue NegN1 = GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + SDValue NegN0 = + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + SDValue NegN1 = + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags); } @@ -12445,8 +12200,10 @@ // (-N0 * -N1) + N2 --> (N0 * N1) + N2 if (isCheaperToUseNegatedFPOps(N0, N1)) { - SDValue NegN0 = GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); - SDValue NegN1 = GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + SDValue NegN0 = + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + SDValue NegN1 = + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags); } @@ -12707,8 +12464,8 @@ if (isCheaperToUseNegatedFPOps(N0, N1)) return DAG.getNode( ISD::FDIV, SDLoc(N), VT, - GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), - GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); return SDValue(); } @@ -13262,9 +13019,8 @@ if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); - if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(), - &DAG.getTarget().Options, ForCodeSize)) - return GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + if (TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize)) + return TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading // constant pool values. Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5331,6 +5331,246 @@ return false; } +char TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth) const { + // fneg is removable even if it has multiple uses. + if (Op.getOpcode() == ISD::FNEG) + return 2; + + // Don't allow anything with multiple uses unless we know it is free. + EVT VT = Op.getValueType(); + const SDNodeFlags Flags = Op->getFlags(); + const TargetOptions &Options = DAG.getTarget().Options; + if (!Op.hasOneUse() && !(Op.getOpcode() == ISD::FP_EXTEND && + isFPExtFree(VT, Op.getOperand(0).getValueType()))) + return 0; + + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return 0; + + switch (Op.getOpcode()) { + case ISD::ConstantFP: { + if (!LegalOperations) + return 1; + + // Don't invert constant FP values after legalization unless the target says + // the negated constant is legal. + return isOperationLegal(ISD::ConstantFP, VT) || + isFPImmLegal(neg(cast(Op)->getValueAPF()), VT, + ForCodeSize); + } + case ISD::BUILD_VECTOR: { + // Only permit BUILD_VECTOR of constants. + if (llvm::any_of(Op->op_values(), [&](SDValue N) { + return !N.isUndef() && !isa(N); + })) + return 0; + if (!LegalOperations) + return 1; + if (isOperationLegal(ISD::ConstantFP, VT) && + isOperationLegal(ISD::BUILD_VECTOR, VT)) + return 1; + return llvm::all_of(Op->op_values(), [&](SDValue N) { + return N.isUndef() || + isFPImmLegal(neg(cast(N)->getValueAPF()), VT, + ForCodeSize); + }); + } + case ISD::FADD: + if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + return 0; + + // After operation legalization, it might not be legal to create new FSUBs. + if (LegalOperations && !isOperationLegalOrCustom(ISD::FSUB, VT)) + return 0; + + // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) + if (char V = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1)) + return V; + // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) + return isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + case ISD::FSUB: + // We can't turn -(A-B) into B-A when we honor signed zeros. + if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + return 0; + + // fold (fneg (fsub A, B)) -> (fsub B, A) + return 1; + + case ISD::FMUL: + case ISD::FDIV: + // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) + if (char V = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1)) + return V; + + // Ignore X * 2.0 because that is expected to be canonicalized to X + X. + if (auto *C = isConstOrConstSplatFP(Op.getOperand(1))) + if (C->isExactlyValue(2.0) && Op.getOpcode() == ISD::FMUL) + return 0; + + return isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + + case ISD::FMA: + case ISD::FMAD: { + if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + return 0; + + // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z)) + // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z)) + char V2 = isNegatibleForFree(Op.getOperand(2), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (!V2) + return 0; + + // One of Op0/Op1 must be cheaply negatible, then select the cheapest. + char V0 = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1); + char V1 = isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + char V01 = std::max(V0, V1); + return V01 ? std::max(V01, V2) : 0; + } + + case ISD::FP_EXTEND: + case ISD::FP_ROUND: + case ISD::FSIN: + return isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1); + } + + return 0; +} + +SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg is removable even if it has multiple uses. + if (Op.getOpcode() == ISD::FNEG) + return Op.getOperand(0); + + assert(Depth <= SelectionDAG::MaxRecursionDepth && + "getNegatedExpression doesn't match isNegatibleForFree"); + const SDNodeFlags Flags = Op->getFlags(); + + switch (Op.getOpcode()) { + case ISD::ConstantFP: { + APFloat V = cast(Op)->getValueAPF(); + V.changeSign(); + return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); + } + case ISD::BUILD_VECTOR: { + SmallVector Ops; + for (SDValue C : Op->op_values()) { + if (C.isUndef()) { + Ops.push_back(C); + continue; + } + APFloat V = cast(C)->getValueAPF(); + V.changeSign(); + Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType())); + } + return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops); + } + case ISD::FADD: + assert((DAG.getTarget().Options.NoSignedZerosFPMath || + Flags.hasNoSignedZeros()) && + "Expected NSZ fp-flag"); + + // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) + if (isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, ForCodeSize, + Depth + 1)) + return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(0), DAG, + LegalOperations, ForCodeSize, + Depth + 1), + Op.getOperand(1), Flags); + // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) + return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(1), DAG, + LegalOperations, ForCodeSize, + Depth + 1), + Op.getOperand(0), Flags); + case ISD::FSUB: + // fold (fneg (fsub 0, B)) -> B + if (ConstantFPSDNode *N0CFP = + isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true)) + if (N0CFP->isZero()) + return Op.getOperand(1); + + // fold (fneg (fsub A, B)) -> (fsub B, A) + return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(0), Flags); + + case ISD::FMUL: + case ISD::FDIV: + // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) + if (isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, ForCodeSize, + Depth + 1)) + return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(0), DAG, + LegalOperations, ForCodeSize, + Depth + 1), + Op.getOperand(1), Flags); + + // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) + return DAG.getNode( + Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), + getNegatedExpression(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1), + Flags); + + case ISD::FMA: + case ISD::FMAD: { + assert((DAG.getTarget().Options.NoSignedZerosFPMath || + Flags.hasNoSignedZeros()) && + "Expected NSZ fp-flag"); + + SDValue Neg2 = getNegatedExpression(Op.getOperand(2), DAG, LegalOperations, + ForCodeSize, Depth + 1); + + char V0 = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, + ForCodeSize, Depth + 1); + char V1 = isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V0 >= V1) { + // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z)) + SDValue Neg0 = getNegatedExpression( + Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Depth + 1); + return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Neg0, + Op.getOperand(1), Neg2, Flags); + } + + // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z)) + SDValue Neg1 = getNegatedExpression(Op.getOperand(1), DAG, LegalOperations, + ForCodeSize, Depth + 1); + return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), + Op.getOperand(0), Neg1, Neg2, Flags); + } + + case ISD::FP_EXTEND: + case ISD::FSIN: + return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(0), DAG, + LegalOperations, ForCodeSize, + Depth + 1)); + case ISD::FP_ROUND: + return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), + getNegatedExpression(Op.getOperand(0), DAG, + LegalOperations, ForCodeSize, + Depth + 1), + Op.getOperand(1)); + } + + llvm_unreachable("Unknown code"); +} + //===----------------------------------------------------------------------===// // Legalization Utilities //===----------------------------------------------------------------------===// Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -798,6 +798,17 @@ /// and some i16 instructions are slow. bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override; + /// Return 1 if we can compute the negated form of the specified expression + /// for the same cost as the expression itself, or 2 if we can compute the + /// negated form more cheaply than the expression itself. Else return 0. + char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations, + bool ForCodeSize, unsigned Depth) const override; + + /// If isNegatibleForFree returns true, return the newly negated expression. + SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, bool ForCodeSize, + unsigned Depth) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -42038,6 +42038,101 @@ return SDValue(); } +char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg patterns are removable even if they have multiple uses. + if (isFNEG(DAG, Op.getNode())) + return 2; + + // Don't recurse exponentially. + if (Depth > SelectionDAG::MaxRecursionDepth) + return 0; + + EVT VT = Op.getValueType(); + EVT SVT = VT.getScalarType(); + switch (Op.getOpcode()) { + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || + !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + break; + + // This is always negatible for free but we might be able to remove some + // extra operand negations as well. + for (int i = 0; i != 3; ++i) { + char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V == 2) + return V; + } + return 1; + } + } + + return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations, + ForCodeSize, Depth); +} + +SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, + bool LegalOperations, + bool ForCodeSize, + unsigned Depth) const { + // fneg patterns are removable even if they have multiple uses. + if (SDValue Arg = isFNEG(DAG, Op.getNode())) + return DAG.getBitcast(Op.getValueType(), Arg); + + EVT VT = Op.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Opc = Op.getOpcode(); + switch (Opc) { + case ISD::FMA: + case X86ISD::FMSUB: + case X86ISD::FNMADD: + case X86ISD::FNMSUB: + case X86ISD::FMADD_RND: + case X86ISD::FMSUB_RND: + case X86ISD::FNMADD_RND: + case X86ISD::FNMSUB_RND: { + if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || + !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations) + break; + + // This is always negatible for free but we might be able to remove some + // extra operand negations as well. + SmallVector NewOps(Op.getNumOperands(), SDValue()); + for (int i = 0; i != 3; ++i) { + char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + if (V == 2) + NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations, + ForCodeSize, Depth + 1); + } + + bool NegA = !!NewOps[0]; + bool NegB = !!NewOps[1]; + bool NegC = !!NewOps[2]; + unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true); + + // Fill in the non-negated ops with the original values. + for (int i = 0, e = Op.getNumOperands(); i != e; ++i) + if (!NewOps[i]) + NewOps[i] = Op.getOperand(i); + return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps); + } + } + + return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, + ForCodeSize, Depth); +} + static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { MVT VT = N->getSimpleValueType(0); @@ -42967,12 +43062,14 @@ } static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { SDLoc dl(N); EVT VT = N->getValueType(0); // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(VT)) return SDValue(); EVT ScalarVT = VT.getScalarType(); @@ -42983,17 +43080,21 @@ SDValue B = N->getOperand(1); SDValue C = N->getOperand(2); - auto invertIfNegative = [&DAG](SDValue &V) { - if (SDValue NegVal = isFNEG(DAG, V.getNode())) { - V = DAG.getBitcast(V.getValueType(), NegVal); + auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); + if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) { + V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize); return true; } // Look through extract_vector_elts. If it comes from an FNEG, create a // new extract from the FNEG input. if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && isNullConstant(V.getOperand(1))) { - if (SDValue NegVal = isFNEG(DAG, V.getOperand(0).getNode())) { - NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal); + SDValue Vec = V.getOperand(0); + if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) { + SDValue NegVal = + TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize); V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), NegVal, V.getOperand(1)); return true; @@ -43023,25 +43124,25 @@ // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C) static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { + TargetLowering::DAGCombinerInfo &DCI) { SDLoc dl(N); EVT VT = N->getValueType(0); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); + bool LegalOperations = !DCI.isBeforeLegalizeOps(); - SDValue NegVal = isFNEG(DAG, N->getOperand(2).getNode()); - if (!NegVal) - return SDValue(); - - // FIXME: Should we bitcast instead? - if (NegVal.getValueType() != VT) + SDValue N2 = N->getOperand(2); + if (!TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize)) return SDValue(); + SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize); unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false); if (N->getNumOperands() == 4) return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), - NegVal, N->getOperand(3)); + NegN2, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), - NegVal); + NegN2); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, @@ -45316,11 +45417,11 @@ case X86ISD::FNMADD_RND: case X86ISD::FNMSUB: case X86ISD::FNMSUB_RND: - case ISD::FMA: return combineFMA(N, DAG, Subtarget); + case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget); case X86ISD::FMADDSUB_RND: case X86ISD::FMSUBADD_RND: case X86ISD::FMADDSUB: - case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); + case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI); case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget); case X86ISD::MGATHER: case X86ISD::MSCATTER: Index: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll +++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll @@ -60,15 +60,15 @@ ; FMA-RECIP-LABEL: f32_one_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem +; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: f32_one_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 +; BDVER2-NEXT: vfmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 +; BDVER2-NEXT: vfnmaddss %xmm1, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: f32_one_step: @@ -94,8 +94,8 @@ ; HASWELL-LABEL: f32_one_step: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem +; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: f32_one_step: @@ -111,8 +111,8 @@ ; AVX512-LABEL: f32_one_step: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem +; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; AVX512-NEXT: retq %div = fdiv fast float 1.0, %x ret float %div Index: llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll +++ llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll @@ -154,8 +154,8 @@ ; FMA-RECIP-LABEL: f32_one_step_2_divs: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; FMA-RECIP-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem +; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; FMA-RECIP-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; FMA-RECIP-NEXT: retq @@ -163,8 +163,8 @@ ; BDVER2-LABEL: f32_one_step_2_divs: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 -; BDVER2-NEXT: vfmaddss %xmm1, %xmm0, %xmm1, %xmm0 +; BDVER2-NEXT: vfmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm0 +; BDVER2-NEXT: vfnmaddss %xmm1, %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; BDVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; BDVER2-NEXT: retq @@ -196,8 +196,8 @@ ; HASWELL-LABEL: f32_one_step_2_divs: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; HASWELL-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem +; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; HASWELL-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; HASWELL-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; HASWELL-NEXT: retq @@ -217,8 +217,8 @@ ; AVX512-LABEL: f32_one_step_2_divs: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + mem -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1 +; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem +; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm1 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq @@ -267,8 +267,8 @@ ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; FMA-RECIP-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; FMA-RECIP-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 +; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 ; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; FMA-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm3 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 @@ -278,9 +278,9 @@ ; BDVER2-LABEL: f32_two_step_2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1 -; BDVER2-NEXT: vfnmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm2 +; BDVER2-NEXT: vfmaddss {{.*}}(%rip), %xmm1, %xmm0, %xmm2 ; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; BDVER2-NEXT: vfmaddss %xmm1, %xmm2, %xmm1, %xmm1 +; BDVER2-NEXT: vfnmaddss %xmm1, %xmm2, %xmm1, %xmm1 ; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3 ; BDVER2-NEXT: vfnmaddss %xmm4, %xmm3, %xmm0, %xmm0 ; BDVER2-NEXT: vfmaddss %xmm3, %xmm0, %xmm1, %xmm0 @@ -322,8 +322,8 @@ ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; HASWELL-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 -; HASWELL-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; HASWELL-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 +; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 ; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; HASWELL-NEXT: vmulss %xmm1, %xmm2, %xmm3 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 @@ -350,8 +350,8 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX512-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 -; AVX512-NEXT: vfmadd132ss {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; AVX512-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 +; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 ; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm3 ; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 @@ -610,9 +610,9 @@ ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-RECIP-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 +; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; FMA-RECIP-NEXT: vmulps %xmm1, %xmm2, %xmm3 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 @@ -622,9 +622,9 @@ ; BDVER2-LABEL: v4f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm2 +; BDVER2-NEXT: vfmaddps {{.*}}(%rip), %xmm1, %xmm0, %xmm2 ; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] -; BDVER2-NEXT: vfmaddps %xmm1, %xmm2, %xmm1, %xmm1 +; BDVER2-NEXT: vfnmaddps %xmm1, %xmm2, %xmm1, %xmm1 ; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3 ; BDVER2-NEXT: vfnmaddps %xmm4, %xmm3, %xmm0, %xmm0 ; BDVER2-NEXT: vfmaddps %xmm3, %xmm0, %xmm1, %xmm0 @@ -665,9 +665,9 @@ ; HASWELL-LABEL: v4f32_two_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; HASWELL-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 +; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 ; HASWELL-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; HASWELL-NEXT: vmulps %xmm1, %xmm2, %xmm3 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 @@ -693,9 +693,9 @@ ; AVX512-LABEL: v4f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpps %xmm0, %xmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vfnmadd231ps {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} xmm2 = (xmm2 * xmm1) + xmm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 +; AVX512-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 ; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm3 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm3 * xmm0) + xmm1 @@ -987,9 +987,9 @@ ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-RECIP-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2 +; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; FMA-RECIP-NEXT: vmulps %ymm1, %ymm2, %ymm3 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 @@ -999,9 +999,9 @@ ; BDVER2-LABEL: v8f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vfnmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm2 +; BDVER2-NEXT: vfmaddps {{.*}}(%rip), %ymm1, %ymm0, %ymm2 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] -; BDVER2-NEXT: vfmaddps %ymm1, %ymm2, %ymm1, %ymm1 +; BDVER2-NEXT: vfnmaddps %ymm1, %ymm2, %ymm1, %ymm1 ; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3 ; BDVER2-NEXT: vfnmaddps %ymm4, %ymm3, %ymm0, %ymm0 ; BDVER2-NEXT: vfmaddps %ymm3, %ymm0, %ymm1, %ymm0 @@ -1042,9 +1042,9 @@ ; HASWELL-LABEL: v8f32_two_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; HASWELL-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2 +; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 ; HASWELL-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NEXT: vmulps %ymm1, %ymm2, %ymm3 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 @@ -1070,9 +1070,9 @@ ; AVX512-LABEL: v8f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcpps %ymm0, %ymm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vfnmadd231ps {{.*#+}} ymm2 = -(ymm0 * ymm1) + ymm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} ymm2 = (ymm2 * ymm1) + ymm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm0 * ymm1) + ymm2 +; AVX512-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 ; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; AVX512-NEXT: vmulps %ymm1, %ymm2, %ymm3 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm3 * ymm0) + ymm1 @@ -1552,17 +1552,17 @@ ; FMA-RECIP-LABEL: v16f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 -; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 +; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm4 = (ymm0 * ymm4) + ymm3 +; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; FMA-RECIP-NEXT: vmulps %ymm2, %ymm4, %ymm5 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm5 * ymm0) + ymm2 ; FMA-RECIP-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm5 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 -; FMA-RECIP-NEXT: vfnmadd231ps {{.*#+}} ymm3 = -(ymm1 * ymm2) + ymm3 -; FMA-RECIP-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm2) + ymm2 +; FMA-RECIP-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm1 * ymm2) + ymm3 +; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; FMA-RECIP-NEXT: vmulps %ymm2, %ymm3, %ymm4 ; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm2 @@ -1572,17 +1572,17 @@ ; BDVER2-LABEL: v16f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm0, %ymm4 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm4, %ymm2, %ymm2 +; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; BDVER2-NEXT: vfmaddps %ymm3, %ymm2, %ymm0, %ymm4 +; BDVER2-NEXT: vfnmaddps %ymm2, %ymm4, %ymm2, %ymm2 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BDVER2-NEXT: vmulps %ymm4, %ymm2, %ymm5 ; BDVER2-NEXT: vfnmaddps %ymm4, %ymm5, %ymm0, %ymm0 ; BDVER2-NEXT: vfmaddps %ymm5, %ymm0, %ymm2, %ymm0 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] -; BDVER2-NEXT: vfnmaddps %ymm3, %ymm2, %ymm1, %ymm3 -; BDVER2-NEXT: vfmaddps %ymm2, %ymm3, %ymm2, %ymm2 +; BDVER2-NEXT: vfmaddps %ymm3, %ymm2, %ymm1, %ymm3 +; BDVER2-NEXT: vfnmaddps %ymm2, %ymm3, %ymm2, %ymm2 ; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4 ; BDVER2-NEXT: vfnmaddps %ymm5, %ymm4, %ymm1, %ymm1 ; BDVER2-NEXT: vfmaddps %ymm4, %ymm1, %ymm2, %ymm1 @@ -1645,17 +1645,17 @@ ; HASWELL-LABEL: v16f32_two_step2: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm2 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm3 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; HASWELL-NEXT: vmovaps %ymm2, %ymm4 -; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm4 = -(ymm0 * ymm4) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm4 = (ymm4 * ymm2) + ymm2 +; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm4 = (ymm0 * ymm4) + ymm3 +; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 ; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; HASWELL-NEXT: vmulps %ymm2, %ymm4, %ymm5 ; HASWELL-NEXT: vrcpps %ymm1, %ymm6 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm0 = -(ymm5 * ymm0) + ymm2 ; HASWELL-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm5 -; HASWELL-NEXT: vfnmadd231ps {{.*#+}} ymm3 = -(ymm1 * ymm6) + ymm3 -; HASWELL-NEXT: vfmadd132ps {{.*#+}} ymm3 = (ymm3 * ymm6) + ymm6 +; HASWELL-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm1 * ymm6) + ymm3 +; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm6) + ymm6 ; HASWELL-NEXT: vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; HASWELL-NEXT: vmulps %ymm2, %ymm3, %ymm4 ; HASWELL-NEXT: vfnmadd213ps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm2 @@ -1692,9 +1692,9 @@ ; AVX512-LABEL: v16f32_two_step2: ; AVX512: # %bb.0: ; AVX512-NEXT: vrcp14ps %zmm0, %zmm1 -; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2 -; AVX512-NEXT: vfmadd132ps {{.*#+}} zmm2 = (zmm2 * zmm1) + zmm1 +; AVX512-NEXT: vbroadcastss {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2 +; AVX512-NEXT: vfnmadd132ps {{.*#+}} zmm2 = -(zmm2 * zmm1) + zmm1 ; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1] ; AVX512-NEXT: vmulps %zmm1, %zmm2, %zmm3 ; AVX512-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm3 * zmm0) + zmm1