diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1812,6 +1812,19 @@
 SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
                                                EVT DestVT, const SDLoc &dl,
                                                SDValue Chain) {
+  unsigned SrcSize = SrcOp.getValueSizeInBits();
+  unsigned SlotSize = SlotVT.getSizeInBits();
+  unsigned DestSize = DestVT.getSizeInBits();
+  Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
+  Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType);
+
+  // Don't convert via the stack if the truncating store or the extending
+  // load would be expensive (i.e. neither legal nor custom).
+  if ((SrcSize > SlotSize &&
+       !TLI.isTruncStoreLegalOrCustom(SrcOp.getValueType(), SlotVT)) ||
+      (SlotSize < DestSize &&
+       !TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, DestVT, SlotVT)))
+    return SDValue();
+
   // Create the stack frame object.
   Align SrcAlign = DAG.getDataLayout().getPrefTypeAlign(
       SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
@@ -1822,12 +1835,6 @@
   MachinePointerInfo PtrInfo =
       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
 
-  unsigned SrcSize = SrcOp.getValueSizeInBits();
-  unsigned SlotSize = SlotVT.getSizeInBits();
-  unsigned DestSize = DestVT.getSizeInBits();
-  Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
-  Align DestAlign = DAG.getDataLayout().getPrefTypeAlign(DestType);
-
   // Emit a store to the stack slot.  Use a truncstore if the input value is
   // larger than SlotVT.
   SDValue Store;
@@ -2415,7 +2422,11 @@
 
   // TODO: Should any fast-math-flags be set for the created nodes?
   LLVM_DEBUG(dbgs() << "Legalizing INT_TO_FP\n");
-  if (SrcVT == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
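+  // Only use the f64-based expansion when the result either fits in f64 or
+  // can be cheaply extended from it; otherwise give up so the conversion can
+  // be lowered as a libcall.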
+  if (SrcVT == MVT::i32 && TLI.isTypeLegal(MVT::f64) &&
+      (DestVT.bitsLE(MVT::f64) ||
+       TLI.isOperationLegal(Node->isStrictFPOpcode() ? ISD::STRICT_FP_EXTEND
+                                                     : ISD::FP_EXTEND,
+                            DestVT))) {
     LLVM_DEBUG(dbgs() << "32-bit [signed|unsigned] integer to float/double "
                          "expansion\n");
 
@@ -2477,8 +2488,9 @@
     }
     return Result;
   }
-  // Code below here assumes !isSigned without checking again.
-  assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
+
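+  // Everything below this point assumes an unsigned conversion; give up on
+  // signed sources so they can be lowered as libcalls.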
+  if (isSigned)
+    return SDValue();
 
   // TODO: Generalize this for use with other types.
   if (((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) ||
@@ -2537,6 +2549,11 @@
     return DAG.getSelect(dl, DestVT, SignBitTest, Slow, Fast);
   }
 
+  // Don't expand this if the target has no cheap FADD for the destination
+  // type.
+  if (!TLI.isOperationLegalOrCustom(
+          Node->isStrictFPOpcode() ? ISD::STRICT_FADD : ISD::FADD, DestVT))
+    return SDValue();
+
   // The following optimization is valid only if every value in SrcVT (when
   // treated as signed) is representable in DestVT.  Check that the mantissa
   // size of DestVT is >= than the number of bits in SrcVT -1.
@@ -2563,7 +2580,8 @@
   // offset depending on the data type.
   uint64_t FF;
   switch (SrcVT.getSimpleVT().SimpleTy) {
-  default: llvm_unreachable("Unsupported integer type!");
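+  // Unsupported source type: give up and let it be lowered as a libcall.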
+  default:
+    return SDValue();
   case MVT::i8 : FF = 0x43800000ULL; break;  // 2^8  (as a float)
   case MVT::i16: FF = 0x47800000ULL; break;  // 2^16 (as a float)
   case MVT::i32: FF = 0x4F800000ULL; break;  // 2^32 (as a float)
@@ -3034,16 +3052,19 @@
       break;
     // We fall back to a stack-based conversion when the FP_ROUND operation
     // isn't available.
-    Tmp1 = EmitStackConvert(Node->getOperand(1), Node->getValueType(0),
-                            Node->getValueType(0), dl, Node->getOperand(0));
-    ReplaceNode(Node, Tmp1.getNode());
-    LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_ROUND node\n");
-    return true;
+    if ((Tmp1 = EmitStackConvert(Node->getOperand(1), Node->getValueType(0),
+                                 Node->getValueType(0), dl,
+                                 Node->getOperand(0)))) {
+      ReplaceNode(Node, Tmp1.getNode());
+      LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_ROUND node\n");
+      return true;
+    }
+    break;
   case ISD::FP_ROUND:
   case ISD::BITCAST:
-    Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
-                            Node->getValueType(0), dl);
-    Results.push_back(Tmp1);
+    if ((Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
+                                 Node->getValueType(0), dl)))
+      Results.push_back(Tmp1);
     break;
   case ISD::STRICT_FP_EXTEND:
     // When strict mode is enforced we can't do expansion because it
@@ -3058,17 +3079,19 @@
       break;
     // We fall back to a stack-based conversion when the FP_EXTEND operation
     // isn't available.
-    Tmp1 = EmitStackConvert(Node->getOperand(1),
-                            Node->getOperand(1).getValueType(),
-                            Node->getValueType(0), dl, Node->getOperand(0));
-    ReplaceNode(Node, Tmp1.getNode());
-    LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_EXTEND node\n");
-    return true;
+    if ((Tmp1 = EmitStackConvert(
+             Node->getOperand(1), Node->getOperand(1).getValueType(),
+             Node->getValueType(0), dl, Node->getOperand(0)))) {
+      ReplaceNode(Node, Tmp1.getNode());
+      LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_EXTEND node\n");
+      return true;
+    }
+    break;
   case ISD::FP_EXTEND:
-    Tmp1 = EmitStackConvert(Node->getOperand(0),
-                            Node->getOperand(0).getValueType(),
-                            Node->getValueType(0), dl);
-    Results.push_back(Tmp1);
+    if ((Tmp1 = EmitStackConvert(Node->getOperand(0),
+                                 Node->getOperand(0).getValueType(),
+                                 Node->getValueType(0), dl)))
+      Results.push_back(Tmp1);
     break;
   case ISD::SIGN_EXTEND_INREG: {
     EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
@@ -3113,10 +3136,11 @@
     LLVM_FALLTHROUGH;
   case ISD::SINT_TO_FP:
   case ISD::STRICT_SINT_TO_FP:
-    Tmp1 = ExpandLegalINT_TO_FP(Node, Tmp2);
-    Results.push_back(Tmp1);
-    if (Node->isStrictFPOpcode())
-      Results.push_back(Tmp2);
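+    // ExpandLegalINT_TO_FP may now fail; in that case push no results and the
+    // node will be lowered as a libcall instead.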
+    if ((Tmp1 = ExpandLegalINT_TO_FP(Node, Tmp2))) {
+      Results.push_back(Tmp1);
+      if (Node->isStrictFPOpcode())
+        Results.push_back(Tmp2);
+    }
     break;
   case ISD::FP_TO_SINT:
     if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
@@ -4001,6 +4025,8 @@
       Results.push_back(Fadd.getValue(1));
       break;
     }
+    case ISD::STRICT_SINT_TO_FP:
+    case ISD::STRICT_UINT_TO_FP:
     case ISD::STRICT_LRINT:
     case ISD::STRICT_LLRINT:
     case ISD::STRICT_LROUND:
@@ -4333,11 +4359,132 @@
     Results.push_back(ExpandLibCall(LC, Node, false));
     break;
   }
+  case ISD::STRICT_SINT_TO_FP:
+  case ISD::STRICT_UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP: {
+    // TODO - Common the code with DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP
+    bool IsStrict = Node->isStrictFPOpcode();
+    bool Signed = Node->getOpcode() == ISD::SINT_TO_FP ||
+                  Node->getOpcode() == ISD::STRICT_SINT_TO_FP;
+    EVT SVT = Node->getOperand(IsStrict ? 1 : 0).getValueType();
+    EVT RVT = Node->getValueType(0);
+    EVT NVT = EVT();
+    SDLoc dl(Node);
+
+    // Even if the input is legal, no libcall may exactly match; e.g. we don't
+    // have i1 -> fp conversions. So the input needs to be promoted to a larger
+    // type, e.g. i13 -> fp. Then, look for an appropriate libcall.
+    RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+    for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
+         IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL;
+         ++IntVT) {
+      NVT = (MVT::SimpleValueType)IntVT;
+      // The source type needs to be big enough to hold the operand.
+      if (NVT.bitsGE(SVT))
+        LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT)
+                    : RTLIB::getUINTTOFP(NVT, RVT);
+    }
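+    // On exit, NVT is the smallest integer type wide enough for the source
+    // that has a matching libcall.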
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall");
+
+    SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
+    // Sign/zero extend the argument if the libcall takes a larger type.
+    SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+                             NVT, Node->getOperand(IsStrict ? 1 : 0));
+    TargetLowering::MakeLibCallOptions CallOptions;
+    CallOptions.setSExt(Signed);
+    std::pair<SDValue, SDValue> Tmp =
+        TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, dl, Chain);
+    Results.push_back(Tmp.first);
+    if (IsStrict)
+      Results.push_back(Tmp.second);
+    break;
+  }
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+  case ISD::STRICT_FP_TO_SINT:
+  case ISD::STRICT_FP_TO_UINT: {
+    // TODO - Common the code with DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT.
+    bool IsStrict = Node->isStrictFPOpcode();
+    bool Signed = Node->getOpcode() == ISD::FP_TO_SINT ||
+                  Node->getOpcode() == ISD::STRICT_FP_TO_SINT;
+
+    SDValue Op = Node->getOperand(IsStrict ? 1 : 0);
+    EVT SVT = Op.getValueType();
+    EVT RVT = Node->getValueType(0);
+    EVT NVT = EVT();
+    SDLoc dl(Node);
+
+    // Even if the result is legal, no libcall may exactly match; e.g. we don't
+    // have fp -> i1 conversions. So the result needs to be promoted to a
+    // larger type, e.g. fp -> i32. Then, look for an appropriate libcall.
+    RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+    for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
+         IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL;
+         ++IntVT) {
+      NVT = (MVT::SimpleValueType)IntVT;
+      // The type needs to be big enough to hold the result.
+      if (NVT.bitsGE(RVT))
+        LC = Signed ? RTLIB::getFPTOSINT(SVT, NVT)
+                    : RTLIB::getFPTOUINT(SVT, NVT);
+    }
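+    // On exit, NVT is the smallest integer type wide enough for the result
+    // that has a matching libcall.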
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall");
+
+    SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
+    TargetLowering::MakeLibCallOptions CallOptions;
+    std::pair<SDValue, SDValue> Tmp =
+        TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl, Chain);
+
+    // Truncate the result if the libcall returns a larger type.
+    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, RVT, Tmp.first));
+    if (IsStrict)
+      Results.push_back(Tmp.second);
+    break;
+  }
+
+  case ISD::FP_ROUND:
+  case ISD::STRICT_FP_ROUND: {
+    // X = FP_ROUND(Y, TRUNC)
+    // TRUNC is a flag, always an integer that is zero or one. If TRUNC is 0,
+    // this is a normal rounding; if TRUNC is 1, this FP_ROUND is known not to
+    // change the value of Y. We can only expand it into a libcall if TRUNC
+    // is 0.
+    bool IsStrict = Node->isStrictFPOpcode();
+    SDValue Op = Node->getOperand(IsStrict ? 1 : 0);
+    SDValue Chain = IsStrict ? Node->getOperand(0) : SDValue();
+    EVT VT = Node->getValueType(0);
+    const ConstantSDNode *Trunc =
+        cast<ConstantSDNode>(Node->getOperand(IsStrict ? 2 : 1));
+    assert(Trunc->isNullValue() &&
+           "Unable to expand as a libcall if it is not normal rounding");
+
+    RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), VT);
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall");
+
+    TargetLowering::MakeLibCallOptions CallOptions;
+    std::pair<SDValue, SDValue> Tmp =
+        TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, SDLoc(Node), Chain);
+    Results.push_back(Tmp.first);
+    if (IsStrict)
+      Results.push_back(Tmp.second);
+    break;
+  }
+  case ISD::FP_EXTEND: {
+    Results.push_back(
+        ExpandLibCall(RTLIB::getFPEXT(Node->getOperand(0).getValueType(),
+                                      Node->getValueType(0)),
+                      Node, false));
+    break;
+  }
+  case ISD::STRICT_FP_EXTEND:
   case ISD::STRICT_FP_TO_FP16: {
     RTLIB::Libcall LC =
-        RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16);
-    assert(LC != RTLIB::UNKNOWN_LIBCALL &&
-           "Unable to expand strict_fp_to_fp16");
+        Node->getOpcode() == ISD::STRICT_FP_TO_FP16
+            ? RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16)
+            : RTLIB::getFPEXT(Node->getOperand(1).getValueType(),
+                              Node->getValueType(0));
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to legalize as libcall");
+
     TargetLowering::MakeLibCallOptions CallOptions;
     std::pair<SDValue, SDValue> Tmp =
         TLI.makeLibCall(DAG, LC, Node->getValueType(0), Node->getOperand(1),
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6477,6 +6477,11 @@
     return true;
   }
 
+  // Don't expand this if the target has no cheap FSUB instruction.
+  if (!isOperationLegalOrCustom(
+          Node->isStrictFPOpcode() ? ISD::STRICT_FSUB : ISD::FSUB, SrcVT))
+    return false;
+
   SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
   SDValue Sel;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -907,8 +907,6 @@
   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
-                        RTLIB::Libcall Call) const;
   SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -378,12 +378,12 @@
   // Virtually no operation on f128 is legal, but LLVM can't expand them when
   // there's a valid register class, so we need custom operations in most cases.
   setOperationAction(ISD::FABS, MVT::f128, Expand);
-  setOperationAction(ISD::FADD, MVT::f128, Custom);
+  setOperationAction(ISD::FADD, MVT::f128, LibCall);
   setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
   setOperationAction(ISD::FCOS, MVT::f128, Expand);
-  setOperationAction(ISD::FDIV, MVT::f128, Custom);
+  setOperationAction(ISD::FDIV, MVT::f128, LibCall);
   setOperationAction(ISD::FMA, MVT::f128, Expand);
-  setOperationAction(ISD::FMUL, MVT::f128, Custom);
+  setOperationAction(ISD::FMUL, MVT::f128, LibCall);
   setOperationAction(ISD::FNEG, MVT::f128, Expand);
   setOperationAction(ISD::FPOW, MVT::f128, Expand);
   setOperationAction(ISD::FREM, MVT::f128, Expand);
@@ -391,7 +391,7 @@
   setOperationAction(ISD::FSIN, MVT::f128, Expand);
   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
-  setOperationAction(ISD::FSUB, MVT::f128, Custom);
+  setOperationAction(ISD::FSUB, MVT::f128, LibCall);
   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
   setOperationAction(ISD::SETCC, MVT::f128, Custom);
   setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
@@ -2850,20 +2850,6 @@
   return std::make_pair(Value, Overflow);
 }
 
-SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
-                                             RTLIB::Libcall Call) const {
-  bool IsStrict = Op->isStrictFPOpcode();
-  unsigned Offset = IsStrict ? 1 : 0;
-  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
-  SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
-  MakeLibCallOptions CallOptions;
-  SDValue Result;
-  SDLoc dl(Op);
-  std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops,
-                                        CallOptions, dl, Chain);
-  return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
-}
-
 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
   if (useSVEForFixedLengthVectorVT(Op.getValueType()))
     return LowerToScalableOp(Op, DAG);
@@ -3046,11 +3032,7 @@
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
 
   assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
-
-  RTLIB::Libcall LC;
-  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  return LowerF128Call(Op, DAG, LC);
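+  // Returning SDValue() lets the DAG legalizer expand this f128 FP_EXTEND as
+  // a libcall.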
+  return SDValue();
 }
 
 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
@@ -3071,19 +3053,7 @@
     return Op;
   }
 
-  RTLIB::Libcall LC;
-  LC = RTLIB::getFPROUND(SrcVT, Op.getValueType());
-
-  // FP_ROUND node has a second operand indicating whether it is known to be
-  // precise. That doesn't take part in the LibCall so we can't directly use
-  // LowerF128Call.
-  MakeLibCallOptions CallOptions;
-  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
-  SDValue Result;
-  SDLoc dl(Op);
-  std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
-                                        CallOptions, dl, Chain);
-  return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
+  return SDValue();
 }
 
 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
@@ -3158,14 +3128,7 @@
     return Op;
   }
 
-  RTLIB::Libcall LC;
-  if (Op.getOpcode() == ISD::FP_TO_SINT ||
-      Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
-    LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
-  else
-    LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());
-
-  return LowerF128Call(Op, DAG, LC);
+  return SDValue();
 }
 
 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
@@ -3241,15 +3204,7 @@
   // fp128.
   if (Op.getValueType() != MVT::f128)
     return Op;
-
-  RTLIB::Libcall LC;
-  if (Op.getOpcode() == ISD::SINT_TO_FP ||
-      Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
-    LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
-  else
-    LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());
-
-  return LowerF128Call(Op, DAG, LC);
+  return SDValue();
 }
 
 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
@@ -4165,22 +4120,14 @@
   case ISD::UMULO:
     return LowerXALUO(Op, DAG);
   case ISD::FADD:
-    if (Op.getValueType() == MVT::f128)
-      return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
   case ISD::FSUB:
-    if (Op.getValueType() == MVT::f128)
-      return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
   case ISD::FMUL:
-    if (Op.getValueType() == MVT::f128)
-      return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
   case ISD::FMA:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
   case ISD::FDIV:
-    if (Op.getValueType() == MVT::f128)
-      return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
   case ISD::FNEG:
     return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1161,6 +1161,7 @@
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -610,6 +610,9 @@
   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
   setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
 
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
+  setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+
   if (Subtarget.has64BitSupport()) {
     // They also have instructions for converting between i64 and fp.
     setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
@@ -1184,8 +1187,11 @@
       AddPromotedToType(ISD::LOAD, MVT::f128, MVT::v4i32);
       AddPromotedToType(ISD::STORE, MVT::f128, MVT::v4i32);
 
-      setOperationAction(ISD::FADD, MVT::f128, Expand);
-      setOperationAction(ISD::FSUB, MVT::f128, Expand);
+      // Mark FADD/FSUB as LibCall so the legalizer doesn't expand fp_to_uint
+      // and int_to_fp in terms of them.
+      setOperationAction(ISD::FADD, MVT::f128, LibCall);
+      setOperationAction(ISD::FSUB, MVT::f128, LibCall);
+
       setOperationAction(ISD::FMUL, MVT::f128, Expand);
       setOperationAction(ISD::FDIV, MVT::f128, Expand);
       setOperationAction(ISD::FNEG, MVT::f128, Expand);
@@ -1198,6 +1204,19 @@
       setOperationAction(ISD::FSQRT, MVT::f128, Expand);
       setOperationAction(ISD::FMA, MVT::f128, Expand);
       setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+
+      setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+      setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+
+      // Expand FP_EXTEND when the result type is fp128.
+      setOperationAction(ISD::FP_EXTEND, MVT::f128, Expand);
+      setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Expand);
+
+      // Custom-lower FP_ROUND so it can be expanded when the source type is
+      // fp128.
+      for (MVT VT : {MVT::f32, MVT::f64}) {
+        setOperationAction(ISD::FP_ROUND, VT, Custom);
+        setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
+      }
     }
 
     if (Subtarget.hasP9Altivec()) {
@@ -8381,7 +8400,7 @@
 
   // FP to INT conversions are legal for f128.
   if (SrcVT == MVT::f128)
-    return Op;
+    return Subtarget.hasP9Vector() ? Op : SDValue();
 
   // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
   // PPC (the libcall is not available).
@@ -8734,7 +8753,7 @@
 
   // Conversions to f128 are legal.
   if (Op.getValueType() == MVT::f128)
-    return Op;
+    return Subtarget.hasP9Vector() ? Op : SDValue();
 
   // Don't handle ppc_fp128 here; let it be lowered to a libcall.
   if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
@@ -10950,6 +10969,15 @@
   }
 }
 
+SDValue PPCTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+  bool IsStrict = Op->isStrictFPOpcode();
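+  // Without Power9 vector support, rounding from fp128 must be done with a
+  // libcall; returning SDValue() defers to the legalizer for that.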
+  if (Op.getOperand(IsStrict ? 1 : 0).getValueType() == MVT::f128 &&
+      !Subtarget.hasP9Vector())
+    return SDValue();
+
+  return Op;
+}
+
 // Custom lowering for fpext vf32 to v2f64
 SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
 
@@ -11086,6 +11114,9 @@
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::MUL:                return LowerMUL(Op, DAG);
   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
+  case ISD::STRICT_FP_ROUND:
+  case ISD::FP_ROUND:
+    return LowerFP_ROUND(Op, DAG);
   case ISD::ROTL:               return LowerROTL(Op, DAG);
 
   // For counter-based loop handling.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1524,9 +1524,6 @@
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
 
-    SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
-                          RTLIB::Libcall Call) const;
-
     SDValue
     LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                          const SmallVectorImpl<ISD::InputArg> &Ins,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19870,7 +19870,7 @@
   }
 
   if (VT == MVT::f128)
-    return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
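+    // The legalizer now lowers f128 conversions as libcalls.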
+    return SDValue();
 
   SDValue ValueToStore = Src;
   if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
@@ -20311,7 +20311,7 @@
   SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
 
   if (DstVT == MVT::f128)
-    return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
+    return SDValue();
 
   if (DstVT.isVector())
     return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
@@ -21391,10 +21391,8 @@
   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
   MVT SVT = In.getSimpleValueType();
 
-  if (VT == MVT::f128) {
-    RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, VT);
-    return LowerF128Call(Op, DAG, LC);
-  }
+  if (VT == MVT::f128)
+    return SDValue();
 
   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
 
@@ -21408,31 +21406,12 @@
 
 SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   bool IsStrict = Op->isStrictFPOpcode();
-
-  MVT VT = Op.getSimpleValueType();
   SDValue In = Op.getOperand(IsStrict ? 1 : 0);
-  MVT SVT = In.getSimpleValueType();
-
   // It's legal except when f128 is involved
-  if (SVT != MVT::f128)
+  if (In.getSimpleValueType() != MVT::f128)
     return Op;
 
-  RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, VT);
-
-  // FP_ROUND node has a second operand indicating whether it is known to be
-  // precise. That doesn't take part in the LibCall so we can't directly use
-  // LowerF128Call.
-
-  SDLoc dl(Op);
-  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
-  MakeLibCallOptions CallOptions;
-  std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
-                                                dl, Chain);
-
-  if (IsStrict)
-    return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
-
-  return Tmp.first;
+  return SDValue();
 }
 
 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -29719,25 +29698,6 @@
   return NOOP;
 }
 
-SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
-                                         RTLIB::Libcall Call) const {
-
-  bool IsStrict = Op->isStrictFPOpcode();
-  unsigned Offset = IsStrict ? 1 : 0;
-  SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
-
-  SDLoc dl(Op);
-  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
-  MakeLibCallOptions CallOptions;
-  std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
-                                                CallOptions, dl, Chain);
-
-  if (IsStrict)
-    return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
-
-  return Tmp.first;
-}
-
 // Custom split CVTPS2PH with wide types.
 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
   SDLoc dl(Op);
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
--- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
@@ -7,16 +7,11 @@
 define fp128 @test_add() {
 ; CHECK-LABEL: test_add:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    adrp x8, lhs
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:lhs]
 ; CHECK-NEXT:    adrp x8, rhs
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:rhs]
-; CHECK-NEXT:    bl __addtf3
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __addtf3
 
   %lhs = load fp128, fp128* @lhs, align 16
   %rhs = load fp128, fp128* @rhs, align 16
@@ -28,16 +23,11 @@
 define fp128 @test_sub() {
 ; CHECK-LABEL: test_sub:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    adrp x8, lhs
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:lhs]
 ; CHECK-NEXT:    adrp x8, rhs
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:rhs]
-; CHECK-NEXT:    bl __subtf3
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __subtf3
 
   %lhs = load fp128, fp128* @lhs, align 16
   %rhs = load fp128, fp128* @rhs, align 16
@@ -49,16 +39,11 @@
 define fp128 @test_mul() {
 ; CHECK-LABEL: test_mul:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    adrp x8, lhs
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:lhs]
 ; CHECK-NEXT:    adrp x8, rhs
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:rhs]
-; CHECK-NEXT:    bl __multf3
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __multf3
 
   %lhs = load fp128, fp128* @lhs, align 16
   %rhs = load fp128, fp128* @rhs, align 16
@@ -70,16 +55,11 @@
 define fp128 @test_div() {
 ; CHECK-LABEL: test_div:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
 ; CHECK-NEXT:    adrp x8, lhs
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:lhs]
 ; CHECK-NEXT:    adrp x8, rhs
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:rhs]
-; CHECK-NEXT:    bl __divtf3
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __divtf3
 
   %lhs = load fp128, fp128* @lhs, align 16
   %rhs = load fp128, fp128* @rhs, align 16
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization-strict.ll
@@ -73,13 +73,10 @@
 define fp128 @test_v1f128(<1 x fp128> %a, fp128 %s) nounwind {
 ; CHECK-LABEL: test_v1f128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v0.16b
 ; CHECK-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NEXT:    mov v1.16b, v2.16b
-; CHECK-NEXT:    bl __addtf3
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __addtf3
   %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 %s, <1 x fp128> %a)
   ret fp128 %b
 }
@@ -151,10 +148,9 @@
 ; CHECK-NEXT:    str x30, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT:    bl __addtf3
 ; CHECK-NEXT:    ldr x30, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    add sp, sp, #32 // =32
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __addtf3
   %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 %s, <2 x fp128> %a)
   ret fp128 %b
 }
@@ -162,10 +158,7 @@
 define fp128 @test_v2f128_neutral(<2 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v2f128_neutral:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    bl __addtf3
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __addtf3
   %b = call fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 0xL00000000000000008000000000000000, <2 x fp128> %a)
   ret fp128 %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-legalization.ll
@@ -84,10 +84,7 @@
 define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v2f128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    bl __addtf3
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __addtf3
   %b = call reassoc fp128 @llvm.vector.reduce.fadd.f128.v2f128(fp128 0xL00000000000000008000000000000000, <2 x fp128> %a)
   ret fp128 %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-legalization-strict.ll
@@ -59,10 +59,7 @@
 define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
 ; CHECK-LABEL: test_v2f128:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    bl __multf3
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-NEXT:    b __multf3
   %b = call fp128 @llvm.vector.reduce.fmul.f128.v2f128(fp128 0xL00000000000000003fff00000000000000, <2 x fp128> %a)
   ret fp128 %b
 }
diff --git a/llvm/test/CodeGen/PowerPC/f128-conv.ll b/llvm/test/CodeGen/PowerPC/f128-conv.ll
--- a/llvm/test/CodeGen/PowerPC/f128-conv.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-conv.ll
@@ -4,7 +4,7 @@
 ; RUN:   | FileCheck %s
 ; RUN: llc -relocation-model=pic -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown \
 ; RUN:   -ppc-vsr-nums-as-vr -verify-machineinstrs -ppc-asm-full-reg-names < %s \
-; RUN:   | FileCheck %s -check-prefix=CHECK-P8
+; RUN:   -enable-soft-fp128 | FileCheck %s -check-prefix=CHECK-P8
 
 @mem = global [5 x i64] [i64 56, i64 63, i64 3, i64 5, i64 6], align 8
 @umem = global [5 x i64] [i64 560, i64 100, i64 34, i64 2, i64 5], align 8
@@ -35,8 +35,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatdikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -86,8 +85,7 @@
 ; CHECK-P8-NEXT:    mr r4, r5
 ; CHECK-P8-NEXT:    bl __floattikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -127,8 +125,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatdikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -166,8 +163,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatdikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -204,12 +200,13 @@
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
-; CHECK-P8-NEXT:    clrldi r3, r4, 63
-; CHECK-P8-NEXT:    neg r3, r3
+; CHECK-P8-NEXT:    andi. r3, r4, 1
+; CHECK-P8-NEXT:    li r4, -1
+; CHECK-P8-NEXT:    li r3, 0
+; CHECK-P8-NEXT:    iselgt r3, r4, r3
 ; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -244,8 +241,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatundikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -295,8 +291,7 @@
 ; CHECK-P8-NEXT:    mr r4, r5
 ; CHECK-P8-NEXT:    bl __floatuntikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -336,8 +331,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatundikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -375,8 +369,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatundikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -411,10 +404,9 @@
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
 ; CHECK-P8-NEXT:    mr r30, r3
 ; CHECK-P8-NEXT:    clrldi r3, r4, 63
-; CHECK-P8-NEXT:    bl __floatunsikf
+; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -454,10 +446,8 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatdikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    mr r5, r3
 ; CHECK-P8-NEXT:    mr r3, r30
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r5, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -501,10 +491,8 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatundikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    mr r5, r3
 ; CHECK-P8-NEXT:    mr r3, r30
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r5, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -543,8 +531,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -580,8 +567,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -623,8 +609,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -661,8 +646,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -698,8 +682,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -741,8 +724,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -783,8 +765,7 @@
 ; CHECK-P8-NEXT:    clrldi r3, r3, 32
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -822,8 +803,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -855,12 +835,12 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
+; CHECK-P8-NEXT:    lhz r4, 0(r4)
 ; CHECK-P8-NEXT:    mr r30, r3
-; CHECK-P8-NEXT:    lhz r3, 0(r4)
+; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -895,14 +875,14 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
+; CHECK-P8-NEXT:    addis r4, r2, .LC4@toc@ha
 ; CHECK-P8-NEXT:    mr r30, r3
-; CHECK-P8-NEXT:    addis r3, r2, .LC4@toc@ha
-; CHECK-P8-NEXT:    ld r3, .LC4@toc@l(r3)
-; CHECK-P8-NEXT:    lhz r3, 6(r3)
+; CHECK-P8-NEXT:    ld r4, .LC4@toc@l(r4)
+; CHECK-P8-NEXT:    lhz r4, 6(r4)
+; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -943,8 +923,7 @@
 ; CHECK-P8-NEXT:    clrldi r3, r3, 32
 ; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -984,8 +963,7 @@
 ; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1016,12 +994,12 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
+; CHECK-P8-NEXT:    lbz r4, 0(r4)
 ; CHECK-P8-NEXT:    mr r30, r3
-; CHECK-P8-NEXT:    lbz r3, 0(r4)
+; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1056,14 +1034,14 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
+; CHECK-P8-NEXT:    addis r4, r2, .LC5@toc@ha
 ; CHECK-P8-NEXT:    mr r30, r3
-; CHECK-P8-NEXT:    addis r3, r2, .LC5@toc@ha
-; CHECK-P8-NEXT:    ld r3, .LC5@toc@l(r3)
-; CHECK-P8-NEXT:    lbz r3, 2(r3)
+; CHECK-P8-NEXT:    ld r4, .LC5@toc@l(r4)
+; CHECK-P8-NEXT:    lbz r4, 2(r4)
+; CHECK-P8-NEXT:    mr r3, r4
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1104,8 +1082,7 @@
 ; CHECK-P8-NEXT:    clrldi r3, r3, 32
 ; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1148,9 +1125,7 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r4, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    bl __trunckfdf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
@@ -1186,9 +1161,7 @@
 ; CHECK-P8-NEXT:    addis r4, r2, .LC6@toc@ha
 ; CHECK-P8-NEXT:    mr r30, r3
 ; CHECK-P8-NEXT:    ld r4, .LC6@toc@l(r4)
-; CHECK-P8-NEXT:    ld r5, 0(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r4)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    lvx v2, 0, r4
 ; CHECK-P8-NEXT:    bl __trunckfdf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    stfdx f1, 0, r30
@@ -1231,9 +1204,7 @@
 ; CHECK-P8-NEXT:    addis r4, r2, .LC7@toc@ha
 ; CHECK-P8-NEXT:    mr r29, r3
 ; CHECK-P8-NEXT:    ld r4, .LC7@toc@l(r4)
-; CHECK-P8-NEXT:    ld r5, 0(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r4)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    lvx v2, 0, r4
 ; CHECK-P8-NEXT:    bl __trunckfdf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    sldi r3, r30, 3
@@ -1273,14 +1244,9 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
-; CHECK-P8-NEXT:    ld r9, 0(r3)
-; CHECK-P8-NEXT:    ld r7, 8(r3)
-; CHECK-P8-NEXT:    ld r8, 0(r4)
-; CHECK-P8-NEXT:    ld r6, 8(r4)
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    mr r30, r5
-; CHECK-P8-NEXT:    mr r3, r9
-; CHECK-P8-NEXT:    mr r4, r7
-; CHECK-P8-NEXT:    mr r5, r8
 ; CHECK-P8-NEXT:    bl __addkf3
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    bl __trunckfdf2
@@ -1318,9 +1284,7 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r4, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    bl __trunckfsf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
@@ -1357,9 +1321,7 @@
 ; CHECK-P8-NEXT:    addis r4, r2, .LC6@toc@ha
 ; CHECK-P8-NEXT:    mr r30, r3
 ; CHECK-P8-NEXT:    ld r4, .LC6@toc@l(r4)
-; CHECK-P8-NEXT:    ld r5, 0(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r4)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    lvx v2, 0, r4
 ; CHECK-P8-NEXT:    bl __trunckfsf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    stfsx f1, 0, r30
@@ -1403,9 +1365,8 @@
 ; CHECK-P8-NEXT:    addis r4, r2, .LC7@toc@ha
 ; CHECK-P8-NEXT:    mr r29, r3
 ; CHECK-P8-NEXT:    ld r4, .LC7@toc@l(r4)
-; CHECK-P8-NEXT:    ld r5, 48(r4)
-; CHECK-P8-NEXT:    ld r4, 56(r4)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    addi r4, r4, 48
+; CHECK-P8-NEXT:    lvx v2, 0, r4
 ; CHECK-P8-NEXT:    bl __trunckfsf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    sldi r3, r30, 2
@@ -1446,14 +1407,9 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
-; CHECK-P8-NEXT:    ld r9, 0(r3)
-; CHECK-P8-NEXT:    ld r7, 8(r3)
-; CHECK-P8-NEXT:    ld r8, 0(r4)
-; CHECK-P8-NEXT:    ld r6, 8(r4)
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    mr r30, r5
-; CHECK-P8-NEXT:    mr r3, r9
-; CHECK-P8-NEXT:    mr r4, r7
-; CHECK-P8-NEXT:    mr r5, r8
 ; CHECK-P8-NEXT:    bl __addkf3
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    bl __trunckfsf2
@@ -1522,10 +1478,9 @@
 ; CHECK-P8-NEXT:    lfdx f1, 0, r3
 ; CHECK-P8-NEXT:    bl __extenddfkf2
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    addis r5, r2, .LC8@toc@ha
-; CHECK-P8-NEXT:    ld r5, .LC8@toc@l(r5)
-; CHECK-P8-NEXT:    std r4, 8(r5)
-; CHECK-P8-NEXT:    std r3, 0(r5)
+; CHECK-P8-NEXT:    addis r3, r2, .LC8@toc@ha
+; CHECK-P8-NEXT:    ld r3, .LC8@toc@l(r3)
+; CHECK-P8-NEXT:    stvx v2, 0, r3
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    mtlr r0
@@ -1560,10 +1515,9 @@
 ; CHECK-P8-NEXT:    lfdx f1, r3, r4
 ; CHECK-P8-NEXT:    bl __extenddfkf2
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    addis r5, r2, .LC8@toc@ha
-; CHECK-P8-NEXT:    ld r5, .LC8@toc@l(r5)
-; CHECK-P8-NEXT:    std r4, 8(r5)
-; CHECK-P8-NEXT:    std r3, 0(r5)
+; CHECK-P8-NEXT:    addis r3, r2, .LC8@toc@ha
+; CHECK-P8-NEXT:    ld r3, .LC8@toc@l(r3)
+; CHECK-P8-NEXT:    stvx v2, 0, r3
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    mtlr r0
@@ -1602,9 +1556,8 @@
 ; CHECK-P8-NEXT:    mr r29, r3
 ; CHECK-P8-NEXT:    bl __extenddfkf2
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    sldi r5, r30, 4
-; CHECK-P8-NEXT:    stdux r3, r29, r5
-; CHECK-P8-NEXT:    std r4, 8(r29)
+; CHECK-P8-NEXT:    sldi r3, r30, 4
+; CHECK-P8-NEXT:    stvx v2, r29, r3
 ; CHECK-P8-NEXT:    addi r1, r1, 64
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1640,8 +1593,7 @@
 ; CHECK-P8-NEXT:    mr r30, r4
 ; CHECK-P8-NEXT:    bl __extenddfkf2
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1700,10 +1652,9 @@
 ; CHECK-P8-NEXT:    lfsx f1, 0, r3
 ; CHECK-P8-NEXT:    bl __extendsfkf2
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    addis r5, r2, .LC8@toc@ha
-; CHECK-P8-NEXT:    ld r5, .LC8@toc@l(r5)
-; CHECK-P8-NEXT:    std r4, 8(r5)
-; CHECK-P8-NEXT:    std r3, 0(r5)
+; CHECK-P8-NEXT:    addis r3, r2, .LC8@toc@ha
+; CHECK-P8-NEXT:    ld r3, .LC8@toc@l(r3)
+; CHECK-P8-NEXT:    stvx v2, 0, r3
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    mtlr r0
@@ -1738,10 +1689,9 @@
 ; CHECK-P8-NEXT:    lfsx f1, r3, r4
 ; CHECK-P8-NEXT:    bl __extendsfkf2
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    addis r5, r2, .LC8@toc@ha
-; CHECK-P8-NEXT:    ld r5, .LC8@toc@l(r5)
-; CHECK-P8-NEXT:    std r4, 8(r5)
-; CHECK-P8-NEXT:    std r3, 0(r5)
+; CHECK-P8-NEXT:    addis r3, r2, .LC8@toc@ha
+; CHECK-P8-NEXT:    ld r3, .LC8@toc@l(r3)
+; CHECK-P8-NEXT:    stvx v2, 0, r3
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    mtlr r0
@@ -1780,9 +1730,8 @@
 ; CHECK-P8-NEXT:    mr r29, r3
 ; CHECK-P8-NEXT:    bl __extendsfkf2
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    sldi r5, r30, 4
-; CHECK-P8-NEXT:    stdux r3, r29, r5
-; CHECK-P8-NEXT:    std r4, 8(r29)
+; CHECK-P8-NEXT:    sldi r3, r30, 4
+; CHECK-P8-NEXT:    stvx v2, r29, r3
 ; CHECK-P8-NEXT:    addi r1, r1, 64
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1818,8 +1767,7 @@
 ; CHECK-P8-NEXT:    mr r30, r4
 ; CHECK-P8-NEXT:    bl __extendsfkf2
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1857,8 +1805,7 @@
 ; CHECK-P8-NEXT:    extsw r3, r3
 ; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1894,8 +1841,7 @@
 ; CHECK-P8-NEXT:    mffprd r3, f0
 ; CHECK-P8-NEXT:    bl __floatdikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1933,8 +1879,7 @@
 ; CHECK-P8-NEXT:    extsw r3, r3
 ; CHECK-P8-NEXT:    bl __floatsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -1970,8 +1915,7 @@
 ; CHECK-P8-NEXT:    mffprd r3, f0
 ; CHECK-P8-NEXT:    bl __floatdikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -2009,8 +1953,7 @@
 ; CHECK-P8-NEXT:    clrldi r3, r3, 32
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -2046,8 +1989,7 @@
 ; CHECK-P8-NEXT:    mffprd r3, f0
 ; CHECK-P8-NEXT:    bl __floatundikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -2085,8 +2027,7 @@
 ; CHECK-P8-NEXT:    clrldi r3, r3, 32
 ; CHECK-P8-NEXT:    bl __floatunsikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -2122,8 +2063,7 @@
 ; CHECK-P8-NEXT:    mffprd r3, f0
 ; CHECK-P8-NEXT:    bl __floatundikf
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r4, 8(r30)
-; CHECK-P8-NEXT:    std r3, 0(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -2160,9 +2100,7 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r4, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    bl __fixkfti
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
@@ -2199,9 +2137,7 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r4, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    bl __fixunskfti
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
@@ -2230,10 +2166,8 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r4, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r5
-; CHECK-P8-NEXT:    bl __fixunskfsi
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    bl __fixkfsi
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
@@ -2261,9 +2195,7 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r4, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r5
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    bl __fixkfsi
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    addi r1, r1, 32
diff --git a/llvm/test/CodeGen/PowerPC/f128-rounding.ll b/llvm/test/CodeGen/PowerPC/f128-rounding.ll
--- a/llvm/test/CodeGen/PowerPC/f128-rounding.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-rounding.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \
 ; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s
 ; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \
-; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s | FileCheck %s \
+; RUN:   -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names < %s -enable-soft-fp128 | FileCheck %s \
 ; RUN:   -check-prefix=CHECK-P8
 
 define void @qp_trunc(fp128* nocapture readonly %a, fp128* nocapture %res) {
@@ -22,15 +22,11 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r6, 8(r3)
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    mr r30, r4
-; CHECK-P8-NEXT:    mr r3, r5
-; CHECK-P8-NEXT:    mr r4, r6
 ; CHECK-P8-NEXT:    bl truncf128
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r3, 0(r30)
-; CHECK-P8-NEXT:    std r4, 8(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -61,15 +57,11 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r6, 8(r3)
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    mr r30, r4
-; CHECK-P8-NEXT:    mr r3, r5
-; CHECK-P8-NEXT:    mr r4, r6
 ; CHECK-P8-NEXT:    bl rintf128
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r3, 0(r30)
-; CHECK-P8-NEXT:    std r4, 8(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -100,15 +92,11 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r6, 8(r3)
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    mr r30, r4
-; CHECK-P8-NEXT:    mr r3, r5
-; CHECK-P8-NEXT:    mr r4, r6
 ; CHECK-P8-NEXT:    bl nearbyintf128
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r3, 0(r30)
-; CHECK-P8-NEXT:    std r4, 8(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -139,15 +127,11 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r6, 8(r3)
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    mr r30, r4
-; CHECK-P8-NEXT:    mr r3, r5
-; CHECK-P8-NEXT:    mr r4, r6
 ; CHECK-P8-NEXT:    bl roundf128
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r3, 0(r30)
-; CHECK-P8-NEXT:    std r4, 8(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -178,15 +162,11 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r6, 8(r3)
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    mr r30, r4
-; CHECK-P8-NEXT:    mr r3, r5
-; CHECK-P8-NEXT:    mr r4, r6
 ; CHECK-P8-NEXT:    bl floorf128
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r3, 0(r30)
-; CHECK-P8-NEXT:    std r4, 8(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
@@ -217,15 +197,11 @@
 ; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -48(r1)
-; CHECK-P8-NEXT:    ld r5, 0(r3)
-; CHECK-P8-NEXT:    ld r6, 8(r3)
+; CHECK-P8-NEXT:    lvx v2, 0, r3
 ; CHECK-P8-NEXT:    mr r30, r4
-; CHECK-P8-NEXT:    mr r3, r5
-; CHECK-P8-NEXT:    mr r4, r6
 ; CHECK-P8-NEXT:    bl ceilf128
 ; CHECK-P8-NEXT:    nop
-; CHECK-P8-NEXT:    std r3, 0(r30)
-; CHECK-P8-NEXT:    std r4, 8(r30)
+; CHECK-P8-NEXT:    stvx v2, 0, r30
 ; CHECK-P8-NEXT:    addi r1, r1, 48
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
 ; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
--- a/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-conv-f128.ll
@@ -6,7 +6,7 @@
 ; RUN:   < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr9 | FileCheck %s \
 ; RUN:   -check-prefix=P9
 ; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
-; RUN:   < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr8 -mattr=-vsx \
+; RUN:   < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr8 -enable-soft-fp128 -mattr=-vsx \
 ; RUN:   | FileCheck %s -check-prefix=NOVSX
 
 declare i1 @llvm.experimental.constrained.fptosi.i1.f128(fp128, metadata)
@@ -203,7 +203,7 @@
 ; NOVSX-NEXT:    stdu r1, -32(r1)
 ; NOVSX-NEXT:    .cfi_def_cfa_offset 32
 ; NOVSX-NEXT:    .cfi_offset lr, 16
-; NOVSX-NEXT:    bl __fixunskfsi
+; NOVSX-NEXT:    bl __fixkfsi
 ; NOVSX-NEXT:    nop
 ; NOVSX-NEXT:    addi r1, r1, 32
 ; NOVSX-NEXT:    ld r0, 16(r1)
@@ -806,7 +806,7 @@
 ; NOVSX-NEXT:    stdu r1, -32(r1)
 ; NOVSX-NEXT:    .cfi_def_cfa_offset 32
 ; NOVSX-NEXT:    .cfi_offset lr, 16
-; NOVSX-NEXT:    bl __floatunsikf
+; NOVSX-NEXT:    bl __floatsikf
 ; NOVSX-NEXT:    nop
 ; NOVSX-NEXT:    addi r1, r1, 32
 ; NOVSX-NEXT:    ld r0, 16(r1)
diff --git a/llvm/test/CodeGen/X86/fp128-load.ll b/llvm/test/CodeGen/X86/fp128-load.ll
--- a/llvm/test/CodeGen/X86/fp128-load.ll
+++ b/llvm/test/CodeGen/X86/fp128-load.ll
@@ -22,14 +22,9 @@
 define fp128 @TestLoadExtend(fp128 %x, i32 %n) {
 ; CHECK-LABEL: TestLoadExtend:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    movslq %edi, %rax
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    callq __extendsftf2
-; CHECK-NEXT:    popq %rax
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    jmp __extendsftf2@PLT # TAILCALL
 entry:
   %idxprom = sext i32 %n to i64
   %arrayidx = getelementptr inbounds [2 x float], [2 x float]* @TestLoadExtend.data, i64 0, i64 %idxprom