diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1812,6 +1812,19 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl, SDValue Chain) { + unsigned SrcSize = SrcOp.getValueSizeInBits(); + unsigned SlotSize = SlotVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + Type *DestType = DestVT.getTypeForEVT(*DAG.getContext()); + Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType); + + // Don't convert with stack if the load/store is expensive. + if ((SrcSize > SlotSize && + !TLI.isTruncStoreLegalOrCustom(SrcOp.getValueType(), SlotVT)) || + (SlotSize < DestSize && + !TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, DestVT, SlotVT))) + return SDValue(); + // Create the stack frame object. Align SrcAlign = DAG.getDataLayout().getPrefTypeAlign( SrcOp.getValueType().getTypeForEVT(*DAG.getContext())); @@ -1822,12 +1835,6 @@ MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); - unsigned SrcSize = SrcOp.getValueSizeInBits(); - unsigned SlotSize = SlotVT.getSizeInBits(); - unsigned DestSize = DestVT.getSizeInBits(); - Type *DestType = DestVT.getTypeForEVT(*DAG.getContext()); - Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType); - // Emit a store to the stack slot. Use a truncstore if the input value is // later than DestVT. SDValue Store; @@ -2415,7 +2422,11 @@ // TODO: Should any fast-math-flags be set for the created nodes? LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n"); - if (SrcVT == MVT::i32 && TLI.isTypeLegal(MVT::f64)) { + if (SrcVT == MVT::i32 && TLI.isTypeLegal(MVT::f64) && + (DestVT.bitsLE(MVT::f64) || + TLI.isOperationLegal(Node->isStrictFPOpcode() ? 
ISD::STRICT_FP_EXTEND + : ISD::FP_EXTEND, + DestVT))) { LLVM_DEBUG(dbgs() << "32-bit [signed|unsigned] integer to float/double " "expansion\n"); @@ -2477,8 +2488,9 @@ } return Result; } - // Code below here assumes !isSigned without checking again. - assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet"); + + if (isSigned) + return SDValue(); // TODO: Generalize this for use with other types. if (((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) || @@ -2537,6 +2549,11 @@ return DAG.getSelect(dl, DestVT, SignBitTest, Slow, Fast); } + // Don't expand it if there isn't cheap fadd. + if (!TLI.isOperationLegalOrCustom( + Node->isStrictFPOpcode() ? ISD::STRICT_FADD : ISD::FADD, DestVT)) + return SDValue(); + // The following optimization is valid only if every value in SrcVT (when // treated as signed) is representable in DestVT. Check that the mantissa // size of DestVT is >= than the number of bits in SrcVT -1. @@ -2563,7 +2580,8 @@ // offset depending on the data type. uint64_t FF; switch (SrcVT.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Unsupported integer type!"); + default: + return SDValue(); case MVT::i8 : FF = 0x43800000ULL; break; // 2^8 (as a float) case MVT::i16: FF = 0x47800000ULL; break; // 2^16 (as a float) case MVT::i32: FF = 0x4F800000ULL; break; // 2^32 (as a float) @@ -3034,16 +3052,19 @@ break; // We fall back to use stack operation when the FP_ROUND operation // isn't available. 
- Tmp1 = EmitStackConvert(Node->getOperand(1), Node->getValueType(0), - Node->getValueType(0), dl, Node->getOperand(0)); - ReplaceNode(Node, Tmp1.getNode()); - LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_ROUND node\n"); - return true; + if ((Tmp1 = EmitStackConvert(Node->getOperand(1), Node->getValueType(0), + Node->getValueType(0), dl, + Node->getOperand(0)))) { + ReplaceNode(Node, Tmp1.getNode()); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_ROUND node\n"); + return true; + } + break; case ISD::FP_ROUND: case ISD::BITCAST: - Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0), - Node->getValueType(0), dl); - Results.push_back(Tmp1); + if ((Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0), + Node->getValueType(0), dl))) + Results.push_back(Tmp1); break; case ISD::STRICT_FP_EXTEND: // When strict mode is enforced we can't do expansion because it @@ -3058,17 +3079,19 @@ break; // We fall back to use stack operation when the FP_EXTEND operation // isn't available. 
- Tmp1 = EmitStackConvert(Node->getOperand(1), - Node->getOperand(1).getValueType(), - Node->getValueType(0), dl, Node->getOperand(0)); - ReplaceNode(Node, Tmp1.getNode()); - LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_EXTEND node\n"); - return true; + if ((Tmp1 = EmitStackConvert( + Node->getOperand(1), Node->getOperand(1).getValueType(), + Node->getValueType(0), dl, Node->getOperand(0)))) { + ReplaceNode(Node, Tmp1.getNode()); + LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_EXTEND node\n"); + return true; + } + break; case ISD::FP_EXTEND: - Tmp1 = EmitStackConvert(Node->getOperand(0), - Node->getOperand(0).getValueType(), - Node->getValueType(0), dl); - Results.push_back(Tmp1); + if ((Tmp1 = EmitStackConvert(Node->getOperand(0), + Node->getOperand(0).getValueType(), + Node->getValueType(0), dl))) + Results.push_back(Tmp1); break; case ISD::SIGN_EXTEND_INREG: { EVT ExtraVT = cast(Node->getOperand(1))->getVT(); @@ -3113,10 +3136,11 @@ LLVM_FALLTHROUGH; case ISD::SINT_TO_FP: case ISD::STRICT_SINT_TO_FP: - Tmp1 = ExpandLegalINT_TO_FP(Node, Tmp2); - Results.push_back(Tmp1); - if (Node->isStrictFPOpcode()) - Results.push_back(Tmp2); + if ((Tmp1 = ExpandLegalINT_TO_FP(Node, Tmp2))) { + Results.push_back(Tmp1); + if (Node->isStrictFPOpcode()) + Results.push_back(Tmp2); + } break; case ISD::FP_TO_SINT: if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG)) @@ -4001,6 +4025,8 @@ Results.push_back(Fadd.getValue(1)); break; } + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: case ISD::STRICT_LRINT: case ISD::STRICT_LLRINT: case ISD::STRICT_LROUND: @@ -4333,11 +4359,132 @@ Results.push_back(ExpandLibCall(LC, Node, false)); break; } + case ISD::STRICT_SINT_TO_FP: + case ISD::STRICT_UINT_TO_FP: + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: { + // TODO - Common the code with DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP + bool IsStrict = Node->isStrictFPOpcode(); + bool Signed = Node->getOpcode() == ISD::SINT_TO_FP || + Node->getOpcode() == 
ISD::STRICT_SINT_TO_FP; + EVT SVT = Node->getOperand(IsStrict ? 1 : 0).getValueType(); + EVT RVT = Node->getValueType(0); + EVT NVT = EVT(); + SDLoc dl(Node); + + // Even if the input is legal, no libcall may exactly match, eg. we don't + // have i1 -> fp conversions. So, it needs to be promoted to a larger type, + // eg: i13 -> fp. Then, look for an appropriate libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE; + t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; + ++t) { + NVT = (MVT::SimpleValueType)t; + // The source needs to be big enough to hold the operand. + if (NVT.bitsGE(SVT)) + LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT) + : RTLIB::getUINTTOFP(NVT, RVT); + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); + + SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue(); + // Sign/zero extend the argument if the libcall takes a larger type. + SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, + NVT, Node->getOperand(IsStrict ? 1 : 0)); + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setSExt(Signed); + std::pair Tmp = + TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, dl, Chain); + Results.push_back(Tmp.first); + if (IsStrict) + Results.push_back(Tmp.second); + break; + } + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: { + // TODO - Common the code with DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT. + bool IsStrict = Node->isStrictFPOpcode(); + bool Signed = Node->getOpcode() == ISD::FP_TO_SINT || + Node->getOpcode() == ISD::STRICT_FP_TO_SINT; + + SDValue Op = Node->getOperand(IsStrict ? 1 : 0); + EVT SVT = Op.getValueType(); + EVT RVT = Node->getValueType(0); + EVT NVT = EVT(); + SDLoc dl(Node); + + // Even if the result is legal, no libcall may exactly match, eg. we don't + // have fp -> i1 conversions. So, it needs to be promoted to a larger type, + // eg: fp -> i32. 
Then, look for an appropriate libcall. + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE; + IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; + ++IntVT) { + NVT = (MVT::SimpleValueType)IntVT; + // The type needs to be big enough to hold the result. + if (NVT.bitsGE(RVT)) + LC = Signed ? RTLIB::getFPTOSINT(SVT, NVT) + : RTLIB::getFPTOUINT(SVT, NVT); + } + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); + + SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue(); + TargetLowering::MakeLibCallOptions CallOptions; + std::pair Tmp = + TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl, Chain); + + // Truncate the result if the libcall returns a larger type. + Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, RVT, Tmp.first)); + if (IsStrict) + Results.push_back(Tmp.second); + break; + } + + case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: { + // X = FP_ROUND(Y, TRUNC) + // TRUNC is a flag, which is always an integer that is zero or one. + // If TRUNC is 0, this is a normal rounding, if it is 1, this FP_ROUND + // is known to not change the value of Y. + // We can only expand it into libcall if the TRUNC is 0. + bool IsStrict = Node->isStrictFPOpcode(); + SDValue Op = Node->getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue(); + EVT VT = Node->getValueType(0); + const ConstantSDNode *Trunc = + cast(Node->getOperand(IsStrict ? 
2 : 1)); + assert(Trunc->isNullValue() && + "Unable to expand as libcall if it is not normal rounding"); + + RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), VT); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); + + TargetLowering::MakeLibCallOptions CallOptions; + std::pair Tmp = + TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, SDLoc(Node), Chain); + Results.push_back(Tmp.first); + if (IsStrict) + Results.push_back(Tmp.second); + break; + } + case ISD::FP_EXTEND: { + Results.push_back( + ExpandLibCall(RTLIB::getFPEXT(Node->getOperand(0).getValueType(), + Node->getValueType(0)), + Node, false)); + break; + } + case ISD::STRICT_FP_EXTEND: case ISD::STRICT_FP_TO_FP16: { RTLIB::Libcall LC = - RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16); - assert(LC != RTLIB::UNKNOWN_LIBCALL && - "Unable to expand strict_fp_to_fp16"); + Node->getOpcode() == ISD::STRICT_FP_TO_FP16 + ? RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16) + : RTLIB::getFPEXT(Node->getOperand(1).getValueType(), + Node->getValueType(0)); + assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall"); + TargetLowering::MakeLibCallOptions CallOptions; std::pair Tmp = TLI.makeLibCall(DAG, LC, Node->getValueType(0), Node->getOperand(1), diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6477,6 +6477,11 @@ return true; } + // Don't expand it if there isn't cheap fsub instruction. + if (!isOperationLegalOrCustom( + Node->isStrictFPOpcode() ? 
ISD::STRICT_FSUB : ISD::FSUB, SrcVT)) + return false; + SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT); SDValue Sel; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -907,8 +907,6 @@ SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -378,12 +378,12 @@ // Virtually no operation on f128 is legal, but LLVM can't expand them when // there's a valid register class, so we need custom operations in most cases. 
setOperationAction(ISD::FABS, MVT::f128, Expand); - setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FADD, MVT::f128, LibCall); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FDIV, MVT::f128, LibCall); setOperationAction(ISD::FMA, MVT::f128, Expand); - setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FMUL, MVT::f128, LibCall); setOperationAction(ISD::FNEG, MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); @@ -391,7 +391,7 @@ setOperationAction(ISD::FSIN, MVT::f128, Expand); setOperationAction(ISD::FSINCOS, MVT::f128, Expand); setOperationAction(ISD::FSQRT, MVT::f128, Expand); - setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FSUB, MVT::f128, LibCall); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom); @@ -2852,20 +2852,6 @@ return std::make_pair(Value, Overflow); } -SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - bool IsStrict = Op->isStrictFPOpcode(); - unsigned Offset = IsStrict ? 1 : 0; - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - SmallVector Ops(Op->op_begin() + Offset, Op->op_end()); - MakeLibCallOptions CallOptions; - SDValue Result; - SDLoc dl(Op); - std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops, - CallOptions, dl, Chain); - return IsStrict ? 
DAG.getMergeValues({Result, Chain}, dl) : Result; -} - SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const { if (useSVEForFixedLengthVectorVT(Op.getValueType())) return LowerToScalableOp(Op, DAG); @@ -3048,11 +3034,7 @@ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU); assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); - - RTLIB::Libcall LC; - LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); + return SDValue(); } SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, @@ -3073,19 +3055,7 @@ return Op; } - RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(SrcVT, Op.getValueType()); - - // FP_ROUND node has a second operand indicating whether it is known to be - // precise. That doesn't take part in the LibCall so we can't directly use - // LowerF128Call. - MakeLibCallOptions CallOptions; - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - SDValue Result; - SDLoc dl(Op); - std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal, - CallOptions, dl, Chain); - return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; + return SDValue(); } SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op, @@ -3160,14 +3130,7 @@ return Op; } - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT || - Op.getOpcode() == ISD::STRICT_FP_TO_SINT) - LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType()); - else - LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); + return SDValue(); } SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, @@ -3235,15 +3198,7 @@ // fp128. 
if (Op.getValueType() != MVT::f128) return Op; - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::SINT_TO_FP || - Op.getOpcode() == ISD::STRICT_SINT_TO_FP) - LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType()); - else - LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); + return SDValue(); } SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, @@ -4159,22 +4114,14 @@ case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: - if (Op.getValueType() == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::ADD_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED); case ISD::FSUB: - if (Op.getValueType() == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::SUB_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED); case ISD::FMUL: - if (Op.getValueType() == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::MUL_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED); case ISD::FMA: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED); case ISD::FDIV: - if (Op.getValueType() == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::DIV_F128); return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED); case ISD::FNEG: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1161,6 +1161,7 @@ SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp --- 
a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -610,6 +610,9 @@ setCondCodeAction(ISD::SETONE, MVT::f32, Expand); setCondCodeAction(ISD::SETONE, MVT::f64, Expand); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); + if (Subtarget.has64BitSupport()) { // They also have instructions for converting between i64 and fp. setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); @@ -1184,8 +1187,11 @@ AddPromotedToType(ISD::LOAD, MVT::f128, MVT::v4i32); AddPromotedToType(ISD::STORE, MVT::f128, MVT::v4i32); - setOperationAction(ISD::FADD, MVT::f128, Expand); - setOperationAction(ISD::FSUB, MVT::f128, Expand); + // Set FADD/FSUB as libcall to prevent the legalizer from expanding + // fp_to_uint and int_to_fp. + setOperationAction(ISD::FADD, MVT::f128, LibCall); + setOperationAction(ISD::FSUB, MVT::f128, LibCall); + setOperationAction(ISD::FMUL, MVT::f128, Expand); setOperationAction(ISD::FDIV, MVT::f128, Expand); setOperationAction(ISD::FNEG, MVT::f128, Expand); @@ -1198,6 +1204,19 @@ setOperationAction(ISD::FSQRT, MVT::f128, Expand); setOperationAction(ISD::FMA, MVT::f128, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); + + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + + // Expand the fp_extend if the target type is fp128. + setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Expand); + + // Expand the fp_round if the source type is fp128. + for (MVT VT : {MVT::f32, MVT::f64}) { + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); + } } if (Subtarget.hasP9Altivec()) { @@ -8381,7 +8400,7 @@ // FP to INT conversions are legal for f128. if (SrcVT == MVT::f128) - return Op; + return Subtarget.hasP9Vector() ? 
Op : SDValue(); // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on // PPC (the libcall is not available). @@ -8734,7 +8753,7 @@ // Conversions to f128 are legal. if (Op.getValueType() == MVT::f128) - return Op; + return Subtarget.hasP9Vector() ? Op : SDValue(); // Don't handle ppc_fp128 here; let it be lowered to a libcall. if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64) @@ -10950,6 +10969,15 @@ } } +SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op->isStrictFPOpcode(); + if (Op.getOperand(IsStrict ? 1 : 0).getValueType() == MVT::f128 && + !Subtarget.hasP9Vector()) + return SDValue(); + + return Op; +} + // Custom lowering for fpext vf32 to v2f64 SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { @@ -11086,6 +11114,9 @@ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::STRICT_FP_ROUND: + case ISD::FP_ROUND: + return LowerFP_ROUND(Op, DAG); case ISD::ROTL: return LowerROTL(Op, DAG); // For counter-based loop handling. 
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1524,9 +1524,6 @@ SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; - SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -19842,7 +19842,7 @@ } if (VT == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT)); + return SDValue(); SDValue ValueToStore = Src; if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit()) @@ -20283,7 +20283,7 @@ SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); if (DstVT == MVT::f128) - return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT)); + return SDValue(); if (DstVT.isVector()) return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); @@ -21363,10 +21363,8 @@ SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); - if (VT == MVT::f128) { - RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT); - return LowerF128Call(Op, DAG, LC); - } + if (VT == MVT::f128) + return SDValue(); assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); @@ -21380,31 +21378,12 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { bool IsStrict = Op->isStrictFPOpcode(); - - MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(IsStrict ? 
1 : 0); - MVT SVT = In.getSimpleValueType(); - // It's legal except when f128 is involved - if (SVT != MVT::f128) + if (In.getSimpleValueType() != MVT::f128) return Op; - RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT); - - // FP_ROUND node has a second operand indicating whether it is known to be - // precise. That doesn't take part in the LibCall so we can't directly use - // LowerF128Call. - - SDLoc dl(Op); - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - MakeLibCallOptions CallOptions; - std::pair Tmp = makeLibCall(DAG, LC, VT, In, CallOptions, - dl, Chain); - - if (IsStrict) - return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); - - return Tmp.first; + return SDValue(); } static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -29691,25 +29670,6 @@ return NOOP; } -SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - - bool IsStrict = Op->isStrictFPOpcode(); - unsigned Offset = IsStrict ? 1 : 0; - SmallVector Ops(Op->op_begin() + Offset, Op->op_end()); - - SDLoc dl(Op); - SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); - MakeLibCallOptions CallOptions; - std::pair Tmp = makeLibCall(DAG, Call, MVT::f128, Ops, - CallOptions, dl, Chain); - - if (IsStrict) - return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); - - return Tmp.first; -} - // Custom split CVTPS2PH with wide types. static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll --- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll @@ -7,16 +7,11 @@ define fp128 @test_add() { ; CHECK-LABEL: test_add: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, lhs ; CHECK-NEXT: ldr q0, [x8, :lo12:lhs] ; CHECK-NEXT: adrp x8, rhs ; CHECK-NEXT: ldr q1, [x8, :lo12:rhs] -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %lhs = load fp128, fp128* @lhs, align 16 %rhs = load fp128, fp128* @rhs, align 16 @@ -28,16 +23,11 @@ define fp128 @test_sub() { ; CHECK-LABEL: test_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, lhs ; CHECK-NEXT: ldr q0, [x8, :lo12:lhs] ; CHECK-NEXT: adrp x8, rhs ; CHECK-NEXT: ldr q1, [x8, :lo12:rhs] -; CHECK-NEXT: bl __subtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __subtf3 %lhs = load fp128, fp128* @lhs, align 16 %rhs = load fp128, fp128* @rhs, align 16 @@ -49,16 +39,11 @@ define fp128 @test_mul() { ; CHECK-LABEL: test_mul: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, lhs ; CHECK-NEXT: ldr q0, [x8, :lo12:lhs] ; CHECK-NEXT: adrp x8, rhs ; CHECK-NEXT: ldr q1, [x8, :lo12:rhs] -; CHECK-NEXT: bl __multf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __multf3 %lhs = load fp128, fp128* @lhs, align 16 %rhs = load fp128, fp128* @rhs, align 16 @@ -70,16 +55,11 @@ define fp128 @test_div() { ; CHECK-LABEL: test_div: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: adrp x8, lhs ; CHECK-NEXT: ldr q0, [x8, :lo12:lhs] ; CHECK-NEXT: adrp x8, rhs ; CHECK-NEXT: ldr q1, [x8, :lo12:rhs] -; CHECK-NEXT: bl __divtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __divtf3 %lhs = load fp128, fp128* @lhs, align 16 %rhs = load fp128, fp128* @rhs, align 16 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll @@ -73,13 +73,10 @@ define fp128 @test_v1f128(<1 x fp128> %a, fp128 %s) nounwind { ; CHECK-LABEL: test_v1f128: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: mov v2.16b, v0.16b ; CHECK-NEXT: mov v0.16b, v1.16b ; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 %s, <1 x fp128> %a) ret fp128 %b } @@ -151,10 +148,9 @@ ; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #32 // =32 -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 %s, <2 x fp128> %a) ret fp128 %b } @@ -162,10 +158,7 @@ define fp128 @test_v2f128_neutral(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128_neutral: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 0xL00000000000000008000000000000000, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll @@ -84,10 +84,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __addtf3 %b = call reassoc fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 0xL00000000000000008000000000000000, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll @@ -59,10 +59,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: bl __multf3 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-NEXT: b __multf3 %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/PowerPC/f128-conv.ll b/llvm/test/CodeGen/PowerPC/f128-conv.ll --- a/llvm/test/CodeGen/PowerPC/f128-conv.ll +++ b/llvm/test/CodeGen/PowerPC/f128-conv.ll @@ -4,7 +4,7 @@ ; RUN: | FileCheck %s ; RUN: llc -relocation-model=pic -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \ ; RUN: -ppc-vsr-nums-as-vr -verify-machineinstrs -ppc-asm-full-reg-names < %s \ -; RUN: | FileCheck %s -check-prefix=CHECK-P8 +; RUN: -enable-soft-fp128 | FileCheck %s -check-prefix=CHECK-P8 @mem = global [5 x i64] [i64 56, i64 63, i64 3, i64 5, i64 6], align 8 @umem = global [5 x i64] [i64 560, i64 100, i64 34, i64 2, i64 5], align 8 @@ -35,8 +35,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -86,8 +85,7 @@ ; CHECK-P8-NEXT: mr r4, r5 ; CHECK-P8-NEXT: bl __floattikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -127,8 +125,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -166,8 +163,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: 
std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -204,12 +200,13 @@ ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: clrldi r3, r4, 63 -; CHECK-P8-NEXT: neg r3, r3 +; CHECK-P8-NEXT: andi. r3, r4, 1 +; CHECK-P8-NEXT: li r4, -1 +; CHECK-P8-NEXT: li r3, 0 +; CHECK-P8-NEXT: iselgt r3, r4, r3 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -244,8 +241,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -295,8 +291,7 @@ ; CHECK-P8-NEXT: mr r4, r5 ; CHECK-P8-NEXT: bl __floatuntikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -336,8 +331,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -375,8 +369,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: 
ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -411,10 +404,9 @@ ; CHECK-P8-NEXT: stdu r1, -48(r1) ; CHECK-P8-NEXT: mr r30, r3 ; CHECK-P8-NEXT: clrldi r3, r4, 63 -; CHECK-P8-NEXT: bl __floatunsikf +; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -454,10 +446,8 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: mr r5, r3 ; CHECK-P8-NEXT: mr r3, r30 -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r5, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -501,10 +491,8 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: mr r5, r3 ; CHECK-P8-NEXT: mr r3, r30 -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r5, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -543,8 +531,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -580,8 +567,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -623,8 +609,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; 
CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -661,8 +646,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -698,8 +682,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -741,8 +724,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -783,8 +765,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -822,8 +803,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -855,12 +835,12 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) +; CHECK-P8-NEXT: 
lhz r4, 0(r4) ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: lhz r3, 0(r4) +; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -895,14 +875,14 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) +; CHECK-P8-NEXT: addis r4, r2, .LC4@toc@ha ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: addis r3, r2, .LC4@toc@ha -; CHECK-P8-NEXT: ld r3, .LC4@toc@l(r3) -; CHECK-P8-NEXT: lhz r3, 6(r3) +; CHECK-P8-NEXT: ld r4, .LC4@toc@l(r4) +; CHECK-P8-NEXT: lhz r4, 6(r4) +; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -943,8 +923,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -984,8 +963,7 @@ ; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1016,12 +994,12 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) +; CHECK-P8-NEXT: lbz r4, 0(r4) ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: lbz r3, 0(r4) +; CHECK-P8-NEXT: mr r3, r4 
; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1056,14 +1034,14 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) +; CHECK-P8-NEXT: addis r4, r2, .LC5@toc@ha ; CHECK-P8-NEXT: mr r30, r3 -; CHECK-P8-NEXT: addis r3, r2, .LC5@toc@ha -; CHECK-P8-NEXT: ld r3, .LC5@toc@l(r3) -; CHECK-P8-NEXT: lbz r3, 2(r3) +; CHECK-P8-NEXT: ld r4, .LC5@toc@l(r4) +; CHECK-P8-NEXT: lbz r4, 2(r4) +; CHECK-P8-NEXT: mr r3, r4 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1104,8 +1082,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1148,9 +1125,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __trunckfdf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -1186,9 +1161,7 @@ ; CHECK-P8-NEXT: addis r4, r2, .LC6@toc@ha ; CHECK-P8-NEXT: mr r30, r3 ; CHECK-P8-NEXT: ld r4, .LC6@toc@l(r4) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r4, 8(r4) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: bl __trunckfdf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: stfdx f1, 0, r30 @@ 
-1231,9 +1204,7 @@ ; CHECK-P8-NEXT: addis r4, r2, .LC7@toc@ha ; CHECK-P8-NEXT: mr r29, r3 ; CHECK-P8-NEXT: ld r4, .LC7@toc@l(r4) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r4, 8(r4) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: bl __trunckfdf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: sldi r3, r30, 3 @@ -1273,14 +1244,9 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r9, 0(r3) -; CHECK-P8-NEXT: ld r7, 8(r3) -; CHECK-P8-NEXT: ld r8, 0(r4) -; CHECK-P8-NEXT: ld r6, 8(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 ; CHECK-P8-NEXT: mr r30, r5 -; CHECK-P8-NEXT: mr r3, r9 -; CHECK-P8-NEXT: mr r4, r7 -; CHECK-P8-NEXT: mr r5, r8 ; CHECK-P8-NEXT: bl __addkf3 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: bl __trunckfdf2 @@ -1318,9 +1284,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __trunckfsf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -1357,9 +1321,7 @@ ; CHECK-P8-NEXT: addis r4, r2, .LC6@toc@ha ; CHECK-P8-NEXT: mr r30, r3 ; CHECK-P8-NEXT: ld r4, .LC6@toc@l(r4) -; CHECK-P8-NEXT: ld r5, 0(r4) -; CHECK-P8-NEXT: ld r4, 8(r4) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: bl __trunckfsf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: stfsx f1, 0, r30 @@ -1403,9 +1365,8 @@ ; CHECK-P8-NEXT: addis r4, r2, .LC7@toc@ha ; CHECK-P8-NEXT: mr r29, r3 ; CHECK-P8-NEXT: ld r4, .LC7@toc@l(r4) -; CHECK-P8-NEXT: ld r5, 48(r4) -; CHECK-P8-NEXT: ld r4, 56(r4) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: addi r4, r4, 48 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: bl __trunckfsf2 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: sldi r3, r30, 2 @@ -1446,14 +1407,9 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded 
Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r9, 0(r3) -; CHECK-P8-NEXT: ld r7, 8(r3) -; CHECK-P8-NEXT: ld r8, 0(r4) -; CHECK-P8-NEXT: ld r6, 8(r4) +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: lvx v3, 0, r4 ; CHECK-P8-NEXT: mr r30, r5 -; CHECK-P8-NEXT: mr r3, r9 -; CHECK-P8-NEXT: mr r4, r7 -; CHECK-P8-NEXT: mr r5, r8 ; CHECK-P8-NEXT: bl __addkf3 ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: bl __trunckfsf2 @@ -1522,10 +1478,9 @@ ; CHECK-P8-NEXT: lfdx f1, 0, r3 ; CHECK-P8-NEXT: bl __extenddfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: addis r5, r2, .LC8@toc@ha -; CHECK-P8-NEXT: ld r5, .LC8@toc@l(r5) -; CHECK-P8-NEXT: std r4, 8(r5) -; CHECK-P8-NEXT: std r3, 0(r5) +; CHECK-P8-NEXT: addis r3, r2, .LC8@toc@ha +; CHECK-P8-NEXT: ld r3, .LC8@toc@l(r3) +; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: mtlr r0 @@ -1560,10 +1515,9 @@ ; CHECK-P8-NEXT: lfdx f1, r3, r4 ; CHECK-P8-NEXT: bl __extenddfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: addis r5, r2, .LC8@toc@ha -; CHECK-P8-NEXT: ld r5, .LC8@toc@l(r5) -; CHECK-P8-NEXT: std r4, 8(r5) -; CHECK-P8-NEXT: std r3, 0(r5) +; CHECK-P8-NEXT: addis r3, r2, .LC8@toc@ha +; CHECK-P8-NEXT: ld r3, .LC8@toc@l(r3) +; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: mtlr r0 @@ -1602,9 +1556,8 @@ ; CHECK-P8-NEXT: mr r29, r3 ; CHECK-P8-NEXT: bl __extenddfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: sldi r5, r30, 4 -; CHECK-P8-NEXT: stdux r3, r29, r5 -; CHECK-P8-NEXT: std r4, 8(r29) +; CHECK-P8-NEXT: sldi r3, r30, 4 +; CHECK-P8-NEXT: stvx v2, r29, r3 ; CHECK-P8-NEXT: addi r1, r1, 64 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1640,8 +1593,7 @@ ; CHECK-P8-NEXT: mr r30, r4 ; CHECK-P8-NEXT: bl __extenddfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; 
CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1700,10 +1652,9 @@ ; CHECK-P8-NEXT: lfsx f1, 0, r3 ; CHECK-P8-NEXT: bl __extendsfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: addis r5, r2, .LC8@toc@ha -; CHECK-P8-NEXT: ld r5, .LC8@toc@l(r5) -; CHECK-P8-NEXT: std r4, 8(r5) -; CHECK-P8-NEXT: std r3, 0(r5) +; CHECK-P8-NEXT: addis r3, r2, .LC8@toc@ha +; CHECK-P8-NEXT: ld r3, .LC8@toc@l(r3) +; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: mtlr r0 @@ -1738,10 +1689,9 @@ ; CHECK-P8-NEXT: lfsx f1, r3, r4 ; CHECK-P8-NEXT: bl __extendsfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: addis r5, r2, .LC8@toc@ha -; CHECK-P8-NEXT: ld r5, .LC8@toc@l(r5) -; CHECK-P8-NEXT: std r4, 8(r5) -; CHECK-P8-NEXT: std r3, 0(r5) +; CHECK-P8-NEXT: addis r3, r2, .LC8@toc@ha +; CHECK-P8-NEXT: ld r3, .LC8@toc@l(r3) +; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: mtlr r0 @@ -1780,9 +1730,8 @@ ; CHECK-P8-NEXT: mr r29, r3 ; CHECK-P8-NEXT: bl __extendsfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: sldi r5, r30, 4 -; CHECK-P8-NEXT: stdux r3, r29, r5 -; CHECK-P8-NEXT: std r4, 8(r29) +; CHECK-P8-NEXT: sldi r3, r30, 4 +; CHECK-P8-NEXT: stvx v2, r29, r3 ; CHECK-P8-NEXT: addi r1, r1, 64 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1818,8 +1767,7 @@ ; CHECK-P8-NEXT: mr r30, r4 ; CHECK-P8-NEXT: bl __extendsfkf2 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1857,8 +1805,7 @@ ; CHECK-P8-NEXT: extsw r3, r3 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, 
r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1894,8 +1841,7 @@ ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1933,8 +1879,7 @@ ; CHECK-P8-NEXT: extsw r3, r3 ; CHECK-P8-NEXT: bl __floatsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -1970,8 +1915,7 @@ ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: bl __floatdikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2009,8 +1953,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2046,8 +1989,7 @@ ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2085,8 +2027,7 @@ ; CHECK-P8-NEXT: clrldi r3, r3, 32 ; CHECK-P8-NEXT: bl __floatunsikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 
16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2122,8 +2063,7 @@ ; CHECK-P8-NEXT: mffprd r3, f0 ; CHECK-P8-NEXT: bl __floatundikf ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r4, 8(r30) -; CHECK-P8-NEXT: std r3, 0(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -2160,9 +2100,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __fixkfti ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -2199,9 +2137,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __fixunskfti ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 @@ -2230,10 +2166,8 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: bl __fixunskfsi +; CHECK-P8-NEXT: lvx v2, 0, r3 +; CHECK-P8-NEXT: bl __fixkfsi ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 ; CHECK-P8-NEXT: ld r0, 16(r1) @@ -2261,9 +2195,7 @@ ; CHECK-P8-NEXT: stdu r1, -32(r1) ; CHECK-P8-NEXT: .cfi_def_cfa_offset 32 ; CHECK-P8-NEXT: .cfi_offset lr, 16 -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r4, 8(r3) -; CHECK-P8-NEXT: mr r3, r5 +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: bl __fixkfsi ; CHECK-P8-NEXT: nop ; CHECK-P8-NEXT: addi r1, r1, 32 diff --git a/llvm/test/CodeGen/PowerPC/f128-rounding.ll b/llvm/test/CodeGen/PowerPC/f128-rounding.ll --- a/llvm/test/CodeGen/PowerPC/f128-rounding.ll +++ 
b/llvm/test/CodeGen/PowerPC/f128-rounding.ll @@ -2,7 +2,7 @@ ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \ ; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s ; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \ -; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s \ +; RUN: -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s -enable-soft-fp128 | FileCheck %s \ ; RUN: -check-prefix=CHECK-P8 define void @qp_trunc(fp128* nocapture readonly %a, fp128* nocapture %res) { @@ -22,15 +22,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl truncf128 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -61,15 +57,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl rintf128 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -100,15 +92,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; 
CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl nearbyintf128 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -139,15 +127,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl roundf128 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -178,15 +162,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl floorf128 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload @@ -217,15 +197,11 @@ ; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill ; CHECK-P8-NEXT: std r0, 16(r1) ; CHECK-P8-NEXT: stdu r1, -48(r1) -; CHECK-P8-NEXT: ld r5, 0(r3) -; CHECK-P8-NEXT: ld r6, 8(r3) +; CHECK-P8-NEXT: lvx v2, 0, r3 ; CHECK-P8-NEXT: mr r30, r4 -; CHECK-P8-NEXT: mr r3, r5 -; CHECK-P8-NEXT: mr r4, r6 ; CHECK-P8-NEXT: bl ceilf128 ; CHECK-P8-NEXT: nop -; CHECK-P8-NEXT: std r3, 0(r30) -; CHECK-P8-NEXT: std r4, 8(r30) +; CHECK-P8-NEXT: 
stvx v2, 0, r30 ; CHECK-P8-NEXT: addi r1, r1, 48 ; CHECK-P8-NEXT: ld r0, 16(r1) ; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll --- a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll +++ b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll @@ -6,7 +6,7 @@ ; RUN: < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr9 | FileCheck %s \ ; RUN: -check-prefix=P9 ; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ -; RUN: < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr8 -mattr=-vsx \ +; RUN: < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr8 -enable-soft-fp128 -mattr=-vsx \ ; RUN: | FileCheck %s -check-prefix=NOVSX declare i1 @llvm.experimental.constrained.fptosi.i1.f128(fp128, metadata) @@ -203,7 +203,7 @@ ; NOVSX-NEXT: stdu r1, -32(r1) ; NOVSX-NEXT: .cfi_def_cfa_offset 32 ; NOVSX-NEXT: .cfi_offset lr, 16 -; NOVSX-NEXT: bl __fixunskfsi +; NOVSX-NEXT: bl __fixkfsi ; NOVSX-NEXT: nop ; NOVSX-NEXT: addi r1, r1, 32 ; NOVSX-NEXT: ld r0, 16(r1) @@ -806,7 +806,7 @@ ; NOVSX-NEXT: stdu r1, -32(r1) ; NOVSX-NEXT: .cfi_def_cfa_offset 32 ; NOVSX-NEXT: .cfi_offset lr, 16 -; NOVSX-NEXT: bl __floatunsikf +; NOVSX-NEXT: bl __floatsikf ; NOVSX-NEXT: nop ; NOVSX-NEXT: addi r1, r1, 32 ; NOVSX-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/X86/fp128-load.ll b/llvm/test/CodeGen/X86/fp128-load.ll --- a/llvm/test/CodeGen/X86/fp128-load.ll +++ b/llvm/test/CodeGen/X86/fp128-load.ll @@ -22,14 +22,9 @@ define fp128 @TestLoadExtend(fp128 %x, i32 %n) { ; CHECK-LABEL: TestLoadExtend: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: movslq %edi, %rax ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: callq __extendsftf2 -; CHECK-NEXT: popq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq +; CHECK-NEXT: jmp __extendsftf2@PLT # TAILCALL entry: %idxprom = sext i32 %n to 
i64 %arrayidx = getelementptr inbounds [2 x float], [2 x float]* @TestLoadExtend.data, i64 0, i64 %idxprom