diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1042,6 +1042,10 @@ return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } + /// Return true if we want to legalize the node \p N as a libcall without + /// querying the operation action. + virtual bool isLegalizedAsLibCall(SDNode *N) const { return false; } + /// Custom method defined by each target to indicate if an operation which /// may require a scale is supported natively by the target. /// If not, the operation is illegal. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -977,6 +977,9 @@ "Unexpected illegal type!"); #endif + if (TLI.isLegalizedAsLibCall(Node)) + return ConvertNodeToLibcall(Node); + // Figure out the correct action; the way to query this varies by opcode TargetLowering::LegalizeAction Action = TargetLowering::Legal; bool SimpleFinishLegalizing = true; @@ -4316,11 +4319,110 @@ Results.push_back(ExpandLibCall(LC, Node, false)); break; } + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: { + // TODO - Common the code with DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP + bool IsStrict = Node->isStrictFPOpcode(); + bool Signed = Node->getOpcode() == ISD::SINT_TO_FP || + Node->getOpcode() == ISD::STRICT_SINT_TO_FP; + EVT SVT = Node->getOperand(IsStrict ? 1 : 0).getValueType(); + EVT RVT = Node->getValueType(0); + EVT NVT = EVT(); + SDLoc dl(Node); + + // If the input is not legal, eg: i1 -> fp, then it needs to be promoted to + // a larger type, eg: i8 -> fp. Even if it is legal, no libcall may exactly + // match. Look for an appropriate libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE; + t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; + ++t) { + NVT = (MVT::SimpleValueType)t; + // The source needs to be big enough to hold the operand. + if (NVT.bitsGE(SVT)) + LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT) + : RTLIB::getUINTTOFP(NVT, RVT); + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); + + SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue(); + // Sign/zero extend the argument if the libcall takes a larger type. + SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, + NVT, Node->getOperand(IsStrict ? 1 : 0)); + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(Signed); + std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall( + DAG, LC, TLI.getTypeToTransformTo(*DAG.getContext(), RVT), Op, + CallOptions, dl, Chain); + Results.push_back(Tmp.first); + if (IsStrict) + Results.push_back(Tmp.second); + break; + } + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: { + // TODO - Common the code with DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT. + bool IsStrict = Node->isStrictFPOpcode(); + bool Signed = Node->getOpcode() == ISD::FP_TO_SINT || + Node->getOpcode() == ISD::STRICT_FP_TO_SINT; + + SDValue Op = Node->getOperand(IsStrict ? 1 : 0); + EVT SVT = Op.getValueType(); + EVT RVT = Node->getValueType(0); + EVT NVT = EVT(); + SDLoc dl(Node); + + // If the result is not legal, eg: fp -> i1, then it needs to be promoted to + // a larger type, eg: fp -> i32.
Even if it is legal, no libcall may exactly + // match, eg. we don't have fp -> i8 conversions. + // Look for an appropriate libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE; + IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; + ++IntVT) { + NVT = (MVT::SimpleValueType)IntVT; + // The type needs to be big enough to hold the result. + if (NVT.bitsGE(RVT)) + LC = Signed ? RTLIB::getFPTOSINT(SVT, NVT) + : RTLIB::getFPTOUINT(SVT, NVT); + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); + + SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue(); + TargetLowering::MakeLibCallOptions CallOptions; + std::pair<SDValue, SDValue> Tmp = + TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl, Chain); + + // Truncate the result if the libcall returns a larger type. + Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, RVT, Tmp.first)); + if (IsStrict) + Results.push_back(Tmp.second); + break; + } + + case ISD::STRICT_FP_EXTEND: + case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_FP16: { - RTLIB::Libcall LC = - RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16); - assert(LC != RTLIB::UNKNOWN_LIBCALL && - "Unable to expand strict_fp_to_fp16"); + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + switch (Opc) { + default: + llvm_unreachable("Unable to legalize as libcall"); + case ISD::STRICT_FP_TO_FP16: + LC = RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16); + break; + case ISD::STRICT_FP_EXTEND: + LC = RTLIB::getFPEXT(Node->getOperand(1).getValueType(), + Node->getValueType(0)); + break; + case ISD::STRICT_FP_ROUND: + LC = RTLIB::getFPROUND(Node->getOperand(1).getValueType(), + Node->getValueType(0)); + break; + } TargetLowering::MakeLibCallOptions CallOptions; std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, Node->getValueType(0), Node->getOperand(1), @@ -4329,6 +4431,31 @@ Results.push_back(Tmp.second); break; } + case ISD::FP_EXTEND: { + Results.push_back( + ExpandLibCall(RTLIB::getFPEXT(Node->getOperand(0).getValueType(), + Node->getValueType(0)), + Node, false)); + break; + } + case ISD::FP_ROUND: { + // X = FP_ROUND(Y, TRUNC) + // TRUNC is a flag, which is always an integer that is zero or one. + // If TRUNC is 0, this is a normal rounding; if it is 1, this FP_ROUND + // is known to not change the value of Y. + // We can only expand it into a libcall if TRUNC is 0.
+ const ConstantSDNode *TRUNC = dyn_cast<ConstantSDNode>(Node->getOperand(1)); + assert(TRUNC && TRUNC->isNullValue() && + "Unable to expand as libcall if it is not normal rounding"); + TargetLowering::MakeLibCallOptions CallOptions; + std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall( + DAG, + RTLIB::getFPROUND(Node->getOperand(0).getValueType(), + Node->getValueType(0)), + Node->getValueType(0), Node->getOperand(0), CallOptions, SDLoc(Node)); + Results.push_back(Tmp.first); + break; + } case ISD::FSUB: case ISD::STRICT_FSUB: ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -530,6 +530,8 @@ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override; + bool isLegalizedAsLibCall(SDNode *N) const override; + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; bool isTruncateFree(EVT VT1, EVT VT2) const override; @@ -902,8 +904,6 @@ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -368,12 +368,12 @@ // Virtually no operation on f128 is legal, but LLVM can't expand them when // there's a valid register class, so we need custom operations in most cases. setOperationAction(ISD::FABS, MVT::f128, Expand); - setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FADD, MVT::f128, LibCall); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FDIV, MVT::f128, LibCall); setOperationAction(ISD::FMA, MVT::f128, Expand); - setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FMUL, MVT::f128, LibCall); setOperationAction(ISD::FNEG, MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); @@ -381,7 +381,7 @@ setOperationAction(ISD::FSIN, MVT::f128, Expand); setOperationAction(ISD::FSINCOS, MVT::f128, Expand); setOperationAction(ISD::FSQRT, MVT::f128, Expand); - setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FSUB, MVT::f128, LibCall); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); @@ -389,7 +389,6 @@ setOperationAction(ISD::BR_CC, MVT::f128, Custom); setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved.
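Each target override that follows implements the same policy: return true exactly for the nodes whose f128 operand or result should go straight to ConvertNodeToLibcall, bypassing the usual operation-action query. As a rough sketch of the shape such an override takes (MyTargetLowering is a hypothetical class name and the exact opcode set is illustrative, not taken from this patch):

// Sketch only, assuming a hypothetical MyTargetLowering subclass; the real
// AArch64, PowerPC and X86 overrides below follow this same pattern.
bool MyTargetLowering::isLegalizedAsLibCall(SDNode *N) const {
  // Strict FP nodes carry their chain in operand 0, so the FP source moves to 1.
  unsigned SrcIdx = N->isStrictFPOpcode() ? 1 : 0;
  switch (N->getOpcode()) {
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
  case ISD::STRICT_SINT_TO_FP:
  case ISD::STRICT_UINT_TO_FP:
  case ISD::FP_EXTEND:
  case ISD::STRICT_FP_EXTEND:
    // An f128 result has no native support; legalize directly as a libcall.
    return N->getValueType(0) == MVT::f128;
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
  case ISD::STRICT_FP_TO_SINT:
  case ISD::STRICT_FP_TO_UINT:
  case ISD::FP_ROUND:
  case ISD::STRICT_FP_ROUND:
    // An f128 source operand is what forces the libcall.
    return N->getOperand(SrcIdx).getValueType() == MVT::f128;
  default:
    return false;
  }
}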
@@ -1211,6 +1210,27 @@ PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } +bool AArch64TargetLowering::isLegalizedAsLibCall(SDNode *N) const { + switch (N->getOpcode()) { + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + case ISD::STRICT_FP_EXTEND: + case ISD::FP_EXTEND: + return N->getValueType(0) == MVT::f128; + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::FP_ROUND: + return N->getOperand(0).getValueType() == MVT::f128; + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_FP_ROUND: + return N->getOperand(1).getValueType() == MVT::f128; + } + return false; +} + void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { assert(VT.isVector() && "VT should be a vector type"); @@ -2826,20 +2846,6 @@ return std::make_pair(Value, Overflow); } -SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - bool IsStrict = Op->isStrictFPOpcode(); - unsigned Offset = IsStrict ? 1 : 0; - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - SmallVector Ops(Op->op_begin() + Offset, Op->op_end()); - MakeLibCallOptions CallOptions; - SDValue Result; - SDLoc dl(Op); - std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops, - CallOptions, dl, Chain); - return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; -} - SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType())) return LowerToScalableOp(Op, DAG); @@ -3021,12 +3027,8 @@ if (Op.getValueType().isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); - assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); - - RTLIB::Libcall LC; - LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); + llvm_unreachable("Unexpected lowering"); + return SDValue(); } SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, @@ -3038,28 +3040,11 @@ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = SrcVal.getValueType(); - if (SrcVT != MVT::f128) { - // Expand cases where the input is a vector bigger than NEON. - if (useSVEForFixedLengthVectorVT(SrcVT)) - return SDValue(); - - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(SrcVT, Op.getValueType()); + // Expand cases where the input is a vector bigger than NEON. + if (useSVEForFixedLengthVectorVT(SrcVT)) + return SDValue(); - // FP_ROUND node has a second operand indicating whether it is known to be - // precise. That doesn't take part in the LibCall so we can't directly use - // LowerF128Call. - MakeLibCallOptions CallOptions; - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - SDValue Result; - SDLoc dl(Op); - std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, - CallOptions, dl, Chain); - return IsStrict ? 
DAG.getMergeValues({Result, Chain}, dl) : Result; + return Op; } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, @@ -3129,19 +3114,7 @@ DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal)); } - if (SrcVal.getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT || - Op.getOpcode() == ISD::STRICT_FP_TO_SINT) - LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType()); - else - LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); + return Op; } SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, @@ -3205,19 +3178,7 @@ if (SrcVal.getValueType() == MVT::i128) return SDValue(); - // Other conversions are legal, unless it's to the completely software-based - // fp128. - if (Op.getValueType() != MVT::f128) - return Op; - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::SINT_TO_FP || - Op.getOpcode() == ISD::STRICT_SINT_TO_FP) - LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType()); - else - LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); + return Op; } SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, @@ -4018,22 +3979,14 @@ case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: - if (Op.getValueType() == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::ADD_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); case ISD::FSUB: - if (Op.getValueType() == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::SUB_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); case ISD::FMUL: - if (Op.getValueType() == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::MUL_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); case ISD::FMA: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: - if (Op.getValueType() == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::DIV_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); case ISD::FNEG: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -908,6 +908,8 @@ /// the immediate into a register. bool isLegalAddImmediate(int64_t Imm) const override; + bool isLegalizedAsLibCall(SDNode *N) const override; + /// isTruncateFree - Return true if it's free to truncate a value of /// type Ty1 to type Ty2. e.g. On PPC it's free to truncate a i64 value in /// register X1 to i32 by referencing its sub-register R1. diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -610,6 +610,9 @@ setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. 
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); @@ -1206,6 +1209,9 @@ setOperationAction(ISD::FSQRT, MVT::f128, Expand); setOperationAction(ISD::FMA, MVT::f128, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); + + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); } if (Subtarget.hasP9Altivec()) { @@ -1388,6 +1394,32 @@ PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive(); } +bool PPCTargetLowering::isLegalizedAsLibCall(SDNode *N) const { + if (Subtarget.hasP9Vector()) + return false; + + unsigned SrcIdx = N->isStrictFPOpcode() ? 1 : 0; + EVT DestVT = N->getValueType(0); + switch (N->getOpcode()) { + case ISD::STRICT_FP_ROUND: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: + case ISD::FP_ROUND: + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return N->getOperand(SrcIdx).getValueType() == MVT::f128; + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + case ISD::STRICT_FP_EXTEND: + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + case ISD::FP_EXTEND: + return DestVT == MVT::f128; + } + + return false; +} + /// getMaxByValAlign - Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) { diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1013,6 +1013,8 @@ bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; + bool isLegalizedAsLibCall(SDNode *N) const override; + bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { @@ -1521,10 +1523,6 @@ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - - SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -717,22 +717,20 @@ setOperationAction(ISD::FSQRT, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); - // We need to custom handle any FP_ROUND with an f128 input, but - // LegalizeDAG uses the result type to know when to run a custom handler. - // So we have to list all legal floating point result types here. 
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, LibCall); + if (isTypeLegal(MVT::f32)) { - setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); } if (isTypeLegal(MVT::f64)) { - setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f64, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); } if (isTypeLegal(MVT::f80)) { - setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); - setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f80, Legal); + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); } setOperationAction(ISD::SETCC, MVT::f128, Custom); @@ -2044,6 +2042,39 @@ IsStrictFPEnabled = true; } +bool X86TargetLowering::isLegalizedAsLibCall(SDNode *N) const { + unsigned SrcIdx = N->isStrictFPOpcode() ? 1 : 0; + EVT DestVT = N->getValueType(0); + + switch (N->getOpcode()) { + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: { + // Don't legalize it as libcall if the SrcVT can be promoted. + EVT SrcVT = N->getOperand(SrcIdx).getValueType(); + return DestVT == MVT::f128 && SrcVT != MVT::i16 && + getOperationAction(N->getOpcode(), SrcVT) != Promote; + } + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: { + // Don't legalize it as libcall if the DestVT can be promoted. + EVT SrcVT = N->getOperand(SrcIdx).getValueType(); + return SrcVT == MVT::f128 && DestVT != MVT::i16 && + getOperationAction(N->getOpcode(), DestVT) != Promote; + } + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: + return DestVT == MVT::f128; + case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: + return N->getOperand(SrcIdx).getValueType() == MVT::f128; + } + return false; +} + // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); @@ -19792,9 +19823,6 @@ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext); } - if (VT == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); - SDValue ValueToStore = Src; if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit()) // Bitcasting to f64 here allows us to do a single 64-bit store from @@ -20233,9 +20261,6 @@ MVT DstVT = Op->getSimpleValueType(0); SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); - if (DstVT == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); - if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); @@ -21220,25 +21245,6 @@ if (UseSSEReg && IsSigned) return Op; - // fp128 needs to use a libcall. - if (SrcVT == MVT::f128) { - RTLIB::Libcall LC; - if (IsSigned) - LC = RTLIB::getFPTOSINT(SrcVT, VT); - else - LC = RTLIB::getFPTOUINT(SrcVT, VT); - - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - MakeLibCallOptions CallOptions; - std::pair Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions, - SDLoc(Op), Chain); - - if (IsStrict) - return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); - - return Tmp.first; - } - // Fall back to X87. 
SDValue Chain; if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) { @@ -21314,11 +21320,6 @@ SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); - if (VT == MVT::f128) { - RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); - return LowerF128Call(Op, DAG, LC); - } - assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); SDValue Res = @@ -21329,35 +21330,6 @@ return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); } -SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - bool IsStrict = Op->isStrictFPOpcode(); - - MVT VT = Op.getSimpleValueType(); - SDValue In = Op.getOperand(IsStrict ? 1 : 0); - MVT SVT = In.getSimpleValueType(); - - // It's legal except when f128 is involved - if (SVT != MVT::f128) - return Op; - - RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT); - - // FP_ROUND node has a second operand indicating whether it is known to be - // precise. That doesn't take part in the LibCall so we can't directly use - // LowerF128Call. - - SDLoc dl(Op); - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - MakeLibCallOptions CallOptions; - std::pair Tmp = makeLibCall(DAG, LC, VT, In, CallOptions, - dl, Chain); - - if (IsStrict) - return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); - - return Tmp.first; -} - static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { bool IsStrict = Op->isStrictFPOpcode(); SDValue Src = Op.getOperand(IsStrict ? 1 : 0); @@ -29658,25 +29630,6 @@ return NOOP; } -SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - - bool IsStrict = Op->isStrictFPOpcode(); - unsigned Offset = IsStrict ? 1 : 0; - SmallVector Ops(Op->op_begin() + Offset, Op->op_end()); - - SDLoc dl(Op); - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - MakeLibCallOptions CallOptions; - std::pair Tmp = makeLibCall(DAG, Call, MVT::f128, Ops, - CallOptions, dl, Chain); - - if (IsStrict) - return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); - - return Tmp.first; -} - // Custom split CVTPS2PH with wide types. static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -29743,8 +29696,6 @@ case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FP_EXTEND: case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::FP_ROUND: - case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP16_TO_FP: case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG); case ISD::FP_TO_FP16: diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll --- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll @@ -7,16 +7,11 @@ define fp128 @test_add() { ; CHECK-LABEL: test_add: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, lhs ; CHECK-NEXT: ldr q0, [x8, :lo12:lhs] ; CHECK-NEXT: adrp x8, rhs ; CHECK-NEXT: ldr q1, [x8, :lo12:rhs] -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %lhs = load fp128, fp128* @lhs, align 16 %rhs = load fp128, fp128* @rhs, align 16 @@ -28,16 +23,11 @@ define fp128 @test_sub() { ; CHECK-LABEL: test_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, lhs ; CHECK-NEXT: ldr q0, [x8, :lo12:lhs] ; CHECK-NEXT: adrp x8, rhs ; CHECK-NEXT: ldr q1, [x8, :lo12:rhs] -; CHECK-NEXT: bl __subtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __subtf3 %lhs = load fp128, fp128* @lhs, align 16 %rhs = load fp128, fp128* @rhs, align 16 @@ -49,16 +39,11 @@ define fp128 @test_mul() { ; CHECK-LABEL: test_mul: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, lhs ; CHECK-NEXT: ldr q0, [x8, :lo12:lhs] ; CHECK-NEXT: adrp x8, rhs ; CHECK-NEXT: ldr q1, [x8, :lo12:rhs] -; CHECK-NEXT: bl __multf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __multf3 %lhs = load fp128, fp128* @lhs, align 16 %rhs = load fp128, fp128* @rhs, align 16 @@ -70,16 +55,11 @@ define fp128 @test_div() { ; CHECK-LABEL: test_div: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, lhs ; CHECK-NEXT: ldr q0, [x8, :lo12:lhs] ; CHECK-NEXT: adrp x8, rhs ; CHECK-NEXT: ldr q1, [x8, :lo12:rhs] -; CHECK-NEXT: bl __divtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __divtf3 %lhs = load fp128, fp128* @lhs, align 16 %rhs = load fp128, fp128* @rhs, align 16 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll @@ -73,13 +73,10 @@ define fp128 @test_v1f128(<1 x fp128> %a, fp128 %s) nounwind { ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: mov v2.16b, v0.16b ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 %s, <1 x fp128> %a) ret fp128 %b } @@ -151,10 +148,9 @@ ; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 // =32 -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 %s, <2 x fp128> %a) ret fp128 %b } @@ -162,10 +158,7 @@ define fp128 @test_v2f128_neutral(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128_neutral: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 0xL00000000000000008000000000000000, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -84,10 +84,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %b = call reassoc fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 0xL00000000000000008000000000000000, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll @@ -59,10 +59,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: bl __multf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __multf3 %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/PowerPC/f128-conv.ll b/llvm/test/CodeGen/PowerPC/f128-conv.ll --- a/llvm/test/CodeGen/PowerPC/f128-conv.ll +++ b/llvm/test/CodeGen/PowerPC/f128-conv.ll @@ -4,7 +4,7 @@ ; RUN: | FileCheck %s ; RUN: llc -relocation-model=pic -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \ ; RUN: -ppc-vsr-nums-as-vr -verify-machineinstrs -ppc-asm-full-reg-names < %s \ -; RUN: | FileCheck %s -check-prefix=CHECK-P8 +; RUN: -enable-soft-fp128 | FileCheck %s -check-prefix=CHECK-P8 @mem = global [5 x i64] [i64 56, i64 63, i64 3, i64 5, i64 6], align 8 @umem = global [5 x i64] [i64 560, i64 100, i64 34, i64 2, i64 5], align 8 @@ -35,8 +35,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -86,8 +85,7 @@ ; CHECK-P8-NEXT: mr r4, r5 ; CHECK-P8-NEXT: bl __floattitf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -127,8 +125,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -166,8 +163,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 
16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -204,12 +200,13 @@ ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: clrldi r3, r4, 63 -; CHECK-P8-NEXT: neg r3, r3 +; CHECK-P8-NEXT: andi. r3, r4, 1 +; CHECK-P8-NEXT: li r4, -1 +; CHECK-P8-NEXT: li r3, 0 +; CHECK-P8-NEXT: iselgt r3, r4, r3 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -244,8 +241,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -295,8 +291,7 @@ ; CHECK-P8-NEXT: mr r4, r5 ; CHECK-P8-NEXT: bl __floatuntitf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -336,8 +331,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -375,8 +369,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -413,8 +406,7 @@ ; CHECK-P8-NEXT: clrldi r3, r4, 63 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -454,10 +446,8 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: mr r5, r3 ; CHECK-P8-NEXT: mr r3, r30 -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r5, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -501,10 +491,8 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: mr r5, r3 ; CHECK-P8-NEXT: mr r3, r30 -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r5, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -543,8 +531,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -580,8 +567,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: 
addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -623,8 +609,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -661,8 +646,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -698,8 +682,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -741,8 +724,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -783,8 +765,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -822,8 +803,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -855,12 +835,12 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) +; CHECK-P8-NEXT: lhz r4, 0(r4) ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: lhz r3, 0(r4) +; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -895,14 +875,14 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) +; CHECK-P8-NEXT: addis r4, r2, .LC4@toc@ha ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: addis r3, r2, .LC4@toc@ha -; CHECK-P8-NEXT: ld r3, .LC4@toc@l(r3) -; CHECK-P8-NEXT: lhz r3, 6(r3) +; CHECK-P8-NEXT: ld r4, .LC4@toc@l(r4) +; CHECK-P8-NEXT: lhz r4, 6(r4) +; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -943,8 +923,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 
16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -984,8 +963,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1016,12 +994,12 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) +; CHECK-P8-NEXT: lbz r4, 0(r4) ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: lbz r3, 0(r4) +; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1056,14 +1034,14 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) +; CHECK-P8-NEXT: addis r4, r2, .LC5@toc@ha ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: addis r3, r2, .LC5@toc@ha -; CHECK-P8-NEXT: ld r3, .LC5@toc@l(r3) -; CHECK-P8-NEXT: lbz r3, 2(r3) +; CHECK-P8-NEXT: ld r4, .LC5@toc@l(r4) +; CHECK-P8-NEXT: lbz r4, 2(r4) +; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1104,8 +1082,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1148,9 +1125,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __trunckfdf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -1186,9 +1161,7 @@ ; CHECK-P8-NEXT: addis r4, r2, .LC6@toc@ha ; CHECK-P8-NEXT: mr r30, r3 ; CHECK-P8-NEXT: ld r4, .LC6@toc@l(r4) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r4, 8(r4) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: bl __trunckfdf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: stfdx f1, 0, r30 @@ -1231,9 +1204,7 @@ ; CHECK-P8-NEXT: addis r4, r2, .LC7@toc@ha ; CHECK-P8-NEXT: mr r29, r3 ; CHECK-P8-NEXT: ld r4, .LC7@toc@l(r4) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r4, 8(r4) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: bl __trunckfdf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: sldi r3, r30, 3 @@ -1273,14 +1244,9 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r9, 0(r3) -; CHECK-P8-NEXT: ld r7, 8(r3) -; CHECK-P8-NEXT: ld r8, 0(r4) -; CHECK-P8-NEXT: ld r6, 8(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 ; CHECK-P8-NEXT: mr r30, r5 -; CHECK-P8-NEXT: mr r3, r9 -; CHECK-P8-NEXT: mr r4, r7 -; CHECK-P8-NEXT: mr r5, r8 ; CHECK-P8-NEXT: bl __addkf3 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: bl __trunckfdf2 @@ -1318,9 +1284,7 @@ ; CHECK-P8-NEXT: 
stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __trunckfsf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -1357,9 +1321,7 @@ ; CHECK-P8-NEXT: addis r4, r2, .LC6@toc@ha ; CHECK-P8-NEXT: mr r30, r3 ; CHECK-P8-NEXT: ld r4, .LC6@toc@l(r4) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r4, 8(r4) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: bl __trunckfsf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: stfsx f1, 0, r30 @@ -1403,9 +1365,8 @@ ; CHECK-P8-NEXT: addis r4, r2, .LC7@toc@ha ; CHECK-P8-NEXT: mr r29, r3 ; CHECK-P8-NEXT: ld r4, .LC7@toc@l(r4) -; CHECK-P8-NEXT: ld r5, 48(r4) -; CHECK-P8-NEXT: ld r4, 56(r4) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: addi r4, r4, 48 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: bl __trunckfsf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: sldi r3, r30, 2 @@ -1446,14 +1407,9 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r9, 0(r3) -; CHECK-P8-NEXT: ld r7, 8(r3) -; CHECK-P8-NEXT: ld r8, 0(r4) -; CHECK-P8-NEXT: ld r6, 8(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 ; CHECK-P8-NEXT: mr r30, r5 -; CHECK-P8-NEXT: mr r3, r9 -; CHECK-P8-NEXT: mr r4, r7 -; CHECK-P8-NEXT: mr r5, r8 ; CHECK-P8-NEXT: bl __addkf3 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: bl __trunckfsf2 @@ -1522,10 +1478,9 @@ ; CHECK-P8-NEXT: lfdx f1, 0, r3 ; CHECK-P8-NEXT: bl __extenddfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: addis r5, r2, .LC8@toc@ha -; CHECK-P8-NEXT: ld r5, .LC8@toc@l(r5) -; CHECK-P8-NEXT: std r4, 8(r5) -; CHECK-P8-NEXT: std r3, 0(r5) +; CHECK-P8-NEXT: addis r3, r2, .LC8@toc@ha +; CHECK-P8-NEXT: ld r3, .LC8@toc@l(r3) +; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: mtlr r0 @@ -1560,10 +1515,9 @@ ; CHECK-P8-NEXT: lfdx f1, r3, r4 ; CHECK-P8-NEXT: bl __extenddfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: addis r5, r2, .LC8@toc@ha -; CHECK-P8-NEXT: ld r5, .LC8@toc@l(r5) -; CHECK-P8-NEXT: std r4, 8(r5) -; CHECK-P8-NEXT: std r3, 0(r5) +; CHECK-P8-NEXT: addis r3, r2, .LC8@toc@ha +; CHECK-P8-NEXT: ld r3, .LC8@toc@l(r3) +; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: mtlr r0 @@ -1602,9 +1556,8 @@ ; CHECK-P8-NEXT: mr r29, r3 ; CHECK-P8-NEXT: bl __extenddfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: sldi r5, r30, 4 -; CHECK-P8-NEXT: stdux r3, r29, r5 -; CHECK-P8-NEXT: std r4, 8(r29) +; CHECK-P8-NEXT: sldi r3, r30, 4 +; CHECK-P8-NEXT: stvx v2, r29, r3 ; CHECK-P8-NEXT: addi r1, r1, 64 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1640,8 +1593,7 @@ ; CHECK-P8-NEXT: mr r30, r4 ; CHECK-P8-NEXT: bl __extenddfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1700,10 +1652,9 @@ ; CHECK-P8-NEXT: lfsx f1, 0, r3 ; CHECK-P8-NEXT: bl __extendsfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: addis r5, r2, .LC8@toc@ha -; CHECK-P8-NEXT: ld r5, .LC8@toc@l(r5) -; CHECK-P8-NEXT: std r4, 8(r5) -; CHECK-P8-NEXT: std r3, 0(r5) +; CHECK-P8-NEXT: addis r3, r2, .LC8@toc@ha +; CHECK-P8-NEXT: ld r3, .LC8@toc@l(r3) +; 
CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: mtlr r0 @@ -1738,10 +1689,9 @@ ; CHECK-P8-NEXT: lfsx f1, r3, r4 ; CHECK-P8-NEXT: bl __extendsfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: addis r5, r2, .LC8@toc@ha -; CHECK-P8-NEXT: ld r5, .LC8@toc@l(r5) -; CHECK-P8-NEXT: std r4, 8(r5) -; CHECK-P8-NEXT: std r3, 0(r5) +; CHECK-P8-NEXT: addis r3, r2, .LC8@toc@ha +; CHECK-P8-NEXT: ld r3, .LC8@toc@l(r3) +; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: mtlr r0 @@ -1780,9 +1730,8 @@ ; CHECK-P8-NEXT: mr r29, r3 ; CHECK-P8-NEXT: bl __extendsfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: sldi r5, r30, 4 -; CHECK-P8-NEXT: stdux r3, r29, r5 -; CHECK-P8-NEXT: std r4, 8(r29) +; CHECK-P8-NEXT: sldi r3, r30, 4 +; CHECK-P8-NEXT: stvx v2, r29, r3 ; CHECK-P8-NEXT: addi r1, r1, 64 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1818,8 +1767,7 @@ ; CHECK-P8-NEXT: mr r30, r4 ; CHECK-P8-NEXT: bl __extendsfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1857,8 +1805,7 @@ ; CHECK-P8-NEXT: extsw r3, r3 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1894,8 +1841,7 @@ ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1933,8 +1879,7 @@ ; CHECK-P8-NEXT: extsw r3, r3 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1970,8 +1915,7 @@ ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2009,8 +1953,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2046,8 +1989,7 @@ ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2085,8 +2027,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; 
CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2122,8 +2063,7 @@ ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2160,9 +2100,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __fixtfti ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -2199,9 +2137,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __fixunstfti ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -2230,9 +2166,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __fixunskfsi ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -2261,9 +2195,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __fixkfsi ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 diff --git a/llvm/test/CodeGen/PowerPC/f128-rounding.ll b/llvm/test/CodeGen/PowerPC/f128-rounding.ll --- a/llvm/test/CodeGen/PowerPC/f128-rounding.ll +++ b/llvm/test/CodeGen/PowerPC/f128-rounding.ll @@ -2,7 +2,7 @@ ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \ ; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s ; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \ -; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s -enable-soft-fp128 | FileCheck %s \ ; RUN: -check-prefix=CHECK-P8 define void @qp_trunc(fp128* nocapture readonly %a, fp128* nocapture %res) { @@ -22,15 +22,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl truncl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -61,15 +57,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl rintl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 
; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -100,15 +92,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl nearbyintl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -139,15 +127,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl roundl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -178,15 +162,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl floorl ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -217,15 +197,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl ceill ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll --- a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll +++ b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll @@ -6,7 +6,7 @@ ; RUN: < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr9 | FileCheck %s \ ; RUN: -check-prefix=P9 ; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ -; RUN: < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr8 -mattr=-vsx \ +; RUN: < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr8 -enable-soft-fp128 -mattr=-vsx \ ; RUN: | FileCheck %s -check-prefix=NOVSX declare i1 @llvm.experimental.constrained.fptosi.i1.f128(fp128, metadata) diff --git a/llvm/test/CodeGen/X86/fp128-load.ll b/llvm/test/CodeGen/X86/fp128-load.ll --- a/llvm/test/CodeGen/X86/fp128-load.ll +++ b/llvm/test/CodeGen/X86/fp128-load.ll @@ -22,14 +22,9 @@ define fp128 @TestLoadExtend(fp128 %x, i32 %n) { ; CHECK-LABEL: TestLoadExtend: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movslq %edi, %rax ; CHECK-NEXT: 
movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq __extendsftf2 -; CHECK-NEXT: popq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-NEXT: jmp __extendsftf2 # TAILCALL entry: %idxprom = sext i32 %n to i64 %arrayidx = getelementptr inbounds [2 x float], [2 x float]* @TestLoadExtend.data, i64 0, i64 %idxprom