Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -97,7 +97,8 @@
     TypeLegal,           // The target natively supports this type.
     TypePromoteInteger,  // Replace this integer with a larger one.
     TypeExpandInteger,   // Split this integer into two of half the size.
-    TypeSoftenFloat,     // Convert this float to a same size integer type.
+    TypeSoftenFloat,     // Convert this float to a same size integer type,
+                         // if an operation is not supported in target HW.
     TypeExpandFloat,     // Split this float into two of half the size.
     TypeScalarizeVector, // Replace this one-element vector with its element.
     TypeSplitVector,     // Split this vector into two of half the size.
@@ -1911,6 +1912,7 @@
   /// up the MVT::LAST_VALUETYPE value to the next multiple of 8.
   uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8];
 
+protected:
   ValueTypeActionImpl ValueTypeActions;
 
 private:
Index: include/llvm/Target/TargetRegisterInfo.h
===================================================================
--- include/llvm/Target/TargetRegisterInfo.h
+++ include/llvm/Target/TargetRegisterInfo.h
@@ -614,9 +614,13 @@
 
   /// Find the largest common subclass of A and B.
   /// Return NULL if there is no common subclass.
+  /// The common subclass should contain
+  /// simple value type SVT if it is not the Any type.
   const TargetRegisterClass *
   getCommonSubClass(const TargetRegisterClass *A,
-                    const TargetRegisterClass *B) const;
+                    const TargetRegisterClass *B,
+                    const MVT::SimpleValueType SVT =
+                    MVT::SimpleValueType::Any) const;
 
   /// Returns a TargetRegisterClass used for pointer values.
   /// If a target supports multiple different pointer register classes,
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8722,6 +8722,22 @@
                      ZeroCmp, Zero, RV);
 }
 
+static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
+  // copysign(x, fp_extend(y)) -> copysign(x, y)
+  // copysign(x, fp_round(y)) -> copysign(x, y)
+  SDValue N1 = N->getOperand(1);
+  // Check the opcode first; leaf nodes (e.g. ConstantFP) have no operand 0.
+  if (N1.getOpcode() != ISD::FP_EXTEND && N1.getOpcode() != ISD::FP_ROUND)
+    return false;
+  // Do not optimize out type conversion of f128 type yet. For some targets
+  // like x86_64, configuration is changed to keep one f128 value in one
+  // SSE register, but instruction selection cannot handle FCOPYSIGN on SSE
+  // registers yet.
+  EVT N1VT = N1->getValueType(0);
+  EVT N1Op0VT = N1->getOperand(0)->getValueType(0);
+  return N1VT == N1Op0VT || N1Op0VT != MVT::f128;
+}
+
 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8765,7 +8781,7 @@
 
   // copysign(x, fp_extend(y)) -> copysign(x, y)
   // copysign(x, fp_round(y)) -> copysign(x, y)
-  if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND)
+  if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
                        N0, N1.getOperand(0));
 
Index: lib/CodeGen/SelectionDAG/InstrEmitter.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -139,7 +139,7 @@
               UseRC = RC;
             else if (RC) {
               const TargetRegisterClass *ComRC =
-                TRI->getCommonSubClass(UseRC, RC);
+                TRI->getCommonSubClass(UseRC, RC, VT.SimpleTy);
               // If multiple uses expect disjoint register classes, we emit
               // copies in AddRegisterOperand.
               if (ComRC)
Index: lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -154,6 +154,7 @@
   SDValue ExpandVectorBuildThroughStack(SDNode* Node);
 
   SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
+  SDValue ExpandConstant(ConstantSDNode *CP);
 
   // if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall
   bool ExpandNode(SDNode *Node);
@@ -294,6 +295,20 @@
   return Result;
 }
 
+/// Materializes an integer Constant node as a load from the constant pool;
+/// the load is chained to the DAG entry node.
+SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) {
+  // Put the node's ConstantInt in the pool; the index is pointer-typed.
+  SDValue PoolIdx = DAG.getConstantPool(
+      CP->getConstantIntValue(), TLI.getPointerTy(DAG.getDataLayout()));
+  // Load with the alignment recorded on the constant-pool node.
+  unsigned PoolAlign = cast<ConstantPoolSDNode>(PoolIdx)->getAlignment();
+  MachinePointerInfo PtrInfo =
+      MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+  return DAG.getLoad(CP->getValueType(0), SDLoc(CP), DAG.getEntryNode(),
+                     PoolIdx, PtrInfo, false, false, false, PoolAlign);
+}
+
 /// Expands an unaligned store to 2 half-size stores.
 static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
                                  const TargetLowering &TLI,
@@ -1192,15 +1207,17 @@
 
 #ifndef NDEBUG
   for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
-    assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
-             TargetLowering::TypeLegal &&
+    assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
+              TargetLowering::TypeLegal ||
+            TLI.isTypeLegal(Node->getValueType(i))) &&
            "Unexpected illegal type!");
 
   for (const SDValue &Op : Node->op_values())
-    assert((TLI.getTypeAction(*DAG.getContext(),
-                              Op.getValueType()) == TargetLowering::TypeLegal ||
-                              Op.getOpcode() == ISD::TargetConstant) &&
-                              "Unexpected illegal type!");
+    assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
+              TargetLowering::TypeLegal ||
+            TLI.isTypeLegal(Op.getValueType()) ||
+            Op.getOpcode() == ISD::TargetConstant) &&
+            "Unexpected illegal type!");
 #endif
 
   // Figure out the correct action; the way to query this varies by opcode
@@ -3390,6 +3407,11 @@
       Results.push_back(ExpandConstantFP(CFP, true));
     break;
   }
+  case ISD::Constant: {
+    ConstantSDNode *CP = cast<ConstantSDNode>(Node);
+    Results.push_back(ExpandConstant(CP));
+    break;
+  }
   case ISD::FSUB: {
     EVT VT = Node->getValueType(0);
     if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
Index: lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -43,10 +43,10 @@
 }
 
 //===----------------------------------------------------------------------===//
-//  Result Float to Integer Conversion.
+//  Convert Float Results to Integer for Non-HW-supported Operations.
 //===----------------------------------------------------------------------===//
 
-void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
+bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
   DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG);
         dbgs() << "\n");
   SDValue R = SDValue();
@@ -59,20 +59,26 @@
 #endif
     llvm_unreachable("Do not know how to soften the result of this operator!");
 
+    case ISD::Register:
+    case ISD::CopyFromReg:
+    case ISD::CopyToReg:
+      assert(isLegalInHWReg(N->getValueType(ResNo)) &&
+             "Unsupported SoftenFloatRes opcode!");
+      // Only when isLegalInHWReg, we can skip check of the operands.
+      R = SDValue(N, ResNo);
+      break;
     case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break;
-    case ISD::BITCAST:     R = SoftenFloatRes_BITCAST(N); break;
+    case ISD::BITCAST:     R = SoftenFloatRes_BITCAST(N, ResNo); break;
     case ISD::BUILD_PAIR:  R = SoftenFloatRes_BUILD_PAIR(N); break;
-    case ISD::ConstantFP:
-      R = SoftenFloatRes_ConstantFP(cast<ConstantFPSDNode>(N));
-      break;
+    case ISD::ConstantFP:  R = SoftenFloatRes_ConstantFP(N, ResNo); break;
     case ISD::EXTRACT_VECTOR_ELT:
       R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break;
-    case ISD::FABS:        R = SoftenFloatRes_FABS(N); break;
+    case ISD::FABS:        R = SoftenFloatRes_FABS(N, ResNo); break;
     case ISD::FMINNUM:     R = SoftenFloatRes_FMINNUM(N); break;
     case ISD::FMAXNUM:     R = SoftenFloatRes_FMAXNUM(N); break;
     case ISD::FADD:        R = SoftenFloatRes_FADD(N); break;
     case ISD::FCEIL:       R = SoftenFloatRes_FCEIL(N); break;
-    case ISD::FCOPYSIGN:   R = SoftenFloatRes_FCOPYSIGN(N); break;
+    case ISD::FCOPYSIGN:   R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break;
     case ISD::FCOS:        R = SoftenFloatRes_FCOS(N); break;
     case ISD::FDIV:        R = SoftenFloatRes_FDIV(N); break;
     case ISD::FEXP:        R = SoftenFloatRes_FEXP(N); break;
@@ -84,7 +90,7 @@
     case ISD::FMA:         R = SoftenFloatRes_FMA(N); break;
     case ISD::FMUL:        R = SoftenFloatRes_FMUL(N); break;
     case ISD::FNEARBYINT:  R = SoftenFloatRes_FNEARBYINT(N); break;
-    case ISD::FNEG:        R = SoftenFloatRes_FNEG(N); break;
+    case ISD::FNEG:        R = SoftenFloatRes_FNEG(N, ResNo); break;
     case ISD::FP_EXTEND:   R = SoftenFloatRes_FP_EXTEND(N); break;
     case ISD::FP_ROUND:    R = SoftenFloatRes_FP_ROUND(N); break;
     case ISD::FP16_TO_FP:  R = SoftenFloatRes_FP16_TO_FP(N); break;
@@ -97,9 +103,9 @@
     case ISD::FSQRT:       R = SoftenFloatRes_FSQRT(N); break;
     case ISD::FSUB:        R = SoftenFloatRes_FSUB(N); break;
     case ISD::FTRUNC:      R = SoftenFloatRes_FTRUNC(N); break;
-    case ISD::LOAD:        R = SoftenFloatRes_LOAD(N); break;
-    case ISD::SELECT:      R = SoftenFloatRes_SELECT(N); break;
-    case ISD::SELECT_CC:   R = SoftenFloatRes_SELECT_CC(N); break;
+    case ISD::LOAD:        R = SoftenFloatRes_LOAD(N, ResNo); break;
+    case ISD::SELECT:      R = SoftenFloatRes_SELECT(N, ResNo); break;
+    case ISD::SELECT_CC:   R = SoftenFloatRes_SELECT_CC(N, ResNo); break;
     case ISD::SINT_TO_FP:
     case ISD::UINT_TO_FP:  R = SoftenFloatRes_XINT_TO_FP(N); break;
     case ISD::UNDEF:       R = SoftenFloatRes_UNDEF(N); break;
@@ -107,11 +113,19 @@
   }
 
   // If R is null, the sub-method took care of registering the result.
-  if (R.getNode())
+  if (R.getNode()) {
     SetSoftenedFloat(SDValue(N, ResNo), R);
+    ReplaceSoftenFloatResult(N, ResNo, R);
+  }
+  // Return true only if the node is changed,
+  // assuming that the operands are also converted when necessary.
+  // Otherwise, return false to tell caller to scan operands.
+  return R.getNode() && R.getNode() != N;
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) {
+  if (isLegalInHWReg(N->getValueType(ResNo)))
+    return SDValue(N, ResNo);
   return BitConvertToInteger(N->getOperand(0));
 }
 
@@ -130,10 +144,14 @@
                      BitConvertToInteger(N->getOperand(1)));
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) {
-  return DAG.getConstant(N->getValueAPF().bitcastToAPInt(), SDLoc(N),
+SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) {
+  // When LegalInHWReg, we can load better from the constant pool.
+  if (isLegalInHWReg(N->getValueType(ResNo)))
+    return SDValue(N, ResNo);
+  ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+  return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
                          TLI.getTypeToTransformTo(*DAG.getContext(),
-                                                  N->getValueType(0)));
+                                                  CN->getValueType(0)));
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
@@ -143,7 +161,10 @@
                      NewOp, N->getOperand(1));
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) {
+  // When LegalInHWReg, FABS can be implemented as native bitwise operations.
+  if (isLegalInHWReg(N->getValueType(ResNo)))
+    return SDValue(N, ResNo);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   unsigned Size = NVT.getSizeInBits();
 
@@ -206,7 +227,10 @@
                          NVT, Op, false, SDLoc(N)).first;
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) {
+  // When LegalInHWReg, FCOPYSIGN can be implemented as native bitwise ops.
+  if (isLegalInHWReg(N->getValueType(ResNo)))
+    return SDValue(N, ResNo);
   SDValue LHS = GetSoftenedFloat(N->getOperand(0));
   SDValue RHS = BitConvertToInteger(N->getOperand(1));
   SDLoc dl(N);
@@ -390,7 +414,10 @@
                          NVT, Op, false, SDLoc(N)).first;
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) {
+  // When LegalInHWReg, FNEG can be implemented as native bitwise operations.
+  if (isLegalInHWReg(N->getValueType(ResNo)))
+    return SDValue(N, ResNo);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDLoc dl(N);
   // Expand Y = FNEG(X) -> Y = SUB -0.0, X
@@ -580,7 +607,8 @@
                          NVT, Op, false, SDLoc(N)).first;
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) {
+  bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo));
   LoadSDNode *L = cast<LoadSDNode>(N);
   EVT VT = N->getValueType(0);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
@@ -595,7 +623,8 @@
                        L->getAAInfo());
     // Legalized the chain result - switch anything that used the old chain to
     // use the new one.
-    ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+    if (N != NewL.getValue(1).getNode())
+      ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
     return NewL;
   }
 
@@ -609,17 +638,24 @@
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
-  return BitConvertToInteger(DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL));
+  auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL);
+  if (LegalInHWReg)
+    return ExtendNode;
+  return BitConvertToInteger(ExtendNode);
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) {
+  if (isLegalInHWReg(N->getValueType(ResNo)))
+    return SDValue(N, ResNo);
   SDValue LHS = GetSoftenedFloat(N->getOperand(1));
   SDValue RHS = GetSoftenedFloat(N->getOperand(2));
   return DAG.getSelect(SDLoc(N),
                        LHS.getValueType(), N->getOperand(0), LHS, RHS);
 }
 
-SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) {
+  if (isLegalInHWReg(N->getValueType(ResNo)))
+    return SDValue(N, ResNo);
   SDValue LHS = GetSoftenedFloat(N->getOperand(2));
   SDValue RHS = GetSoftenedFloat(N->getOperand(3));
   return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
@@ -645,7 +681,8 @@
 
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
-  ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1));
+  if (N != NewVAARG.getValue(1).getNode())
+    ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1));
   return NewVAARG;
 }
 
@@ -679,7 +716,7 @@
 
 
 //===----------------------------------------------------------------------===//
-//  Operand Float to Integer Conversion..
+//  Convert Float Operand to Integer for Non-HW-supported Operations.
 //===----------------------------------------------------------------------===//
 
 bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
@@ -689,6 +726,8 @@
 
   switch (N->getOpcode()) {
   default:
+    if (CanSkipSoftenFloatOperand(N, OpNo))
+      return false;
 #ifndef NDEBUG
     dbgs() << "SoftenFloatOperand Op #" << OpNo << ": ";
     N->dump(&DAG); dbgs() << "\n";
@@ -704,14 +743,23 @@
   case ISD::FP_TO_UINT:  Res = SoftenFloatOp_FP_TO_UINT(N); break;
   case ISD::SELECT_CC:   Res = SoftenFloatOp_SELECT_CC(N); break;
   case ISD::SETCC:       Res = SoftenFloatOp_SETCC(N); break;
-  case ISD::STORE:       Res = SoftenFloatOp_STORE(N, OpNo); break;
+  case ISD::STORE:
+    Res = SoftenFloatOp_STORE(N, OpNo);
+    // Do not try to analyze or soften this node again if the value is
+    // or can be held in a register. In that case, Res.getNode() should
+    // be equal to N.
+    if (Res.getNode() == N &&
+        isLegalInHWReg(N->getOperand(OpNo).getValueType()))
+      return false;
+    // Otherwise, we need to reanalyze and lower the new Res nodes.
+    break;
   }
 
   // If the result is null, the sub-method took care of registering results etc.
   if (!Res.getNode()) return false;
 
   // If the result is N, the sub-method updated N in place.  Tell the legalizer
-  // core about this.
+  // core about this to re-analyze.
   if (Res.getNode() == N)
     return true;
 
@@ -722,6 +770,41 @@
   return false;
 }
 
+bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) {
+  const SDValue Operand = N->getOperand(OpNo);
+  if (!isLegalInHWReg(Operand.getValueType()))
+    return false;
+  // For these operand producers SoftenFloatResult will call ReplaceValueWith
+  // to replace all references, so the operand needs no softening here.
+  switch (Operand.getOpcode()) {
+    case ISD::BITCAST:
+    case ISD::ConstantFP:
+    case ISD::CopyFromReg:
+    case ISD::CopyToReg:
+    case ISD::FABS:
+    case ISD::FCOPYSIGN:
+    case ISD::FNEG:
+    case ISD::Register:
+    case ISD::SELECT:
+    case ISD::SELECT_CC:
+      return true;
+  }
+  // For these users SoftenFloatResult handles all softening and operand
+  // replacement itself; the node may still be scanned for other operands.
+  switch (N->getOpcode()) {
+    case ISD::ConstantFP:
+    case ISD::CopyFromReg:
+    case ISD::CopyToReg:
+    case ISD::FABS:
+    case ISD::FCOPYSIGN:
+    case ISD::FNEG:
+    case ISD::Register:
+      return true;
+  }
+  // Anything else still has to go through the regular operand softening.
+  return false;
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
   return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
                      GetSoftenedFloat(N->getOperand(0)));
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.h
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -72,6 +72,20 @@
     return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal;
   }
 
+  /// isSimpleLegalType - Return true if this is a simple legal type.
+  bool isSimpleLegalType(EVT VT) const {
+    return VT.isSimple() && TLI.isTypeLegal(VT);
+  }
+
+  /// isLegalInHWReg - Return true if this type can be passed in registers,
+  /// i.e. it transforms to itself and is a simple legal type. For example,
+  /// x86_64's f128 should stay legally in registers, with only some
+  /// operations converted to library calls or integer bitwise operations.
+  bool isLegalInHWReg(EVT VT) const {
+    EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+    return VT == NVT && isSimpleLegalType(VT);
+  }
+
   EVT getSetCCResultType(EVT VT) const {
     return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
   }
@@ -372,32 +386,48 @@
   // Float to Integer Conversion Support: LegalizeFloatTypes.cpp
   //===--------------------------------------------------------------------===//
 
-  /// GetSoftenedFloat - Given a processed operand Op which was converted to an
-  /// integer of the same size, this returns the integer.  The integer contains
-  /// exactly the same bits as Op - only the type changed.  For example, if Op
-  /// is an f32 which was softened to an i32, then this method returns an i32,
-  /// the bits of which coincide with those of Op.
+  /// GetSoftenedFloat - Given an operand Op of Float type, returns the integer
+  /// if the Op is not supported in target HW and converted to the integer.
+  /// The integer contains exactly the same bits as Op - only the type changed.
+  /// For example, if Op is an f32 which was softened to an i32, then this method
+  /// returns an i32, the bits of which coincide with those of Op.
+  /// If the Op can be efficiently supported in target HW or the operand must
+  /// stay in a register, the Op is not converted to an integer.
+  /// In that case, the given op is returned.
   SDValue GetSoftenedFloat(SDValue Op) {
     SDValue &SoftenedOp = SoftenedFloats[Op];
+    if (!SoftenedOp.getNode() &&
+        isSimpleLegalType(Op.getValueType()))
+      return Op;
     RemapValue(SoftenedOp);
     assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?");
     return SoftenedOp;
   }
   void SetSoftenedFloat(SDValue Op, SDValue Result);
 
-  // Result Float to Integer Conversion.
-  void SoftenFloatResult(SDNode *N, unsigned OpNo);
+  // Call ReplaceValueWith(SDValue(N, ResNo), Res) if necessary.
+  void ReplaceSoftenFloatResult(SDNode *N, unsigned ResNo, SDValue &NewRes) {
+    // When the result type can be kept in HW registers, the converted
+    // NewRes node could have the same type. We can save the effort of cloning
+    // every user of N in SoftenFloatOperand or other legalization functions
+    // by calling ReplaceValueWith here to update all users.
+    if (NewRes.getNode() != N && isLegalInHWReg(N->getValueType(ResNo)))
+      ReplaceValueWith(SDValue(N, ResNo), NewRes);
+  }
+
+  // Convert Float Results to Integer for Non-HW-supported Operations.
+  bool SoftenFloatResult(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
-  SDValue SoftenFloatRes_BITCAST(SDNode *N);
+  SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
-  SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N);
+  SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
-  SDValue SoftenFloatRes_FABS(SDNode *N);
+  SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_FMINNUM(SDNode *N);
   SDValue SoftenFloatRes_FMAXNUM(SDNode *N);
   SDValue SoftenFloatRes_FADD(SDNode *N);
   SDValue SoftenFloatRes_FCEIL(SDNode *N);
-  SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N);
+  SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_FCOS(SDNode *N);
   SDValue SoftenFloatRes_FDIV(SDNode *N);
   SDValue SoftenFloatRes_FEXP(SDNode *N);
@@ -409,7 +439,7 @@
   SDValue SoftenFloatRes_FMA(SDNode *N);
   SDValue SoftenFloatRes_FMUL(SDNode *N);
   SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
-  SDValue SoftenFloatRes_FNEG(SDNode *N);
+  SDValue SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
   SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N);
   SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
@@ -422,14 +452,19 @@
   SDValue SoftenFloatRes_FSQRT(SDNode *N);
   SDValue SoftenFloatRes_FSUB(SDNode *N);
   SDValue SoftenFloatRes_FTRUNC(SDNode *N);
-  SDValue SoftenFloatRes_LOAD(SDNode *N);
-  SDValue SoftenFloatRes_SELECT(SDNode *N);
-  SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
+  SDValue SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo);
+  SDValue SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo);
+  SDValue SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo);
   SDValue SoftenFloatRes_UNDEF(SDNode *N);
   SDValue SoftenFloatRes_VAARG(SDNode *N);
   SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
 
-  // Operand Float to Integer Conversion.
+  // Return true if we can skip softening the given operand or SDNode because
+  // it was softened before by SoftenFloatResult and references to the operand
+  // were replaced by ReplaceValueWith.
+  bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo);
+
+  // Convert Float Operand to Integer for Non-HW-supported Operations.
   bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
   SDValue SoftenFloatOp_BITCAST(SDNode *N);
   SDValue SoftenFloatOp_BR_CC(SDNode *N);
Index: lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -238,9 +238,13 @@
         Changed = true;
         goto NodeDone;
       case TargetLowering::TypeSoftenFloat:
-        SoftenFloatResult(N, i);
-        Changed = true;
-        goto NodeDone;
+        Changed = SoftenFloatResult(N, i);
+        if (Changed)
+          goto NodeDone;
+        // If not changed, the result type should be legal in a register.
+        assert(isLegalInHWReg(ResultVT) &&
+               "Unchanged SoftenFloatResult should be legal in register!");
+        goto ScanOperands;
       case TargetLowering::TypeExpandFloat:
         ExpandFloatResult(N, i);
         Changed = true;
@@ -411,18 +415,27 @@
     bool Failed = false;
 
     // Check that all result types are legal.
+    // A value type is illegal if its TypeAction is not TypeLegal,
+    // and TLI.RegClassForVT does not have a register class for this type.
+    // For example, the x86_64 target has f128 that is not TypeLegal,
+    // to have softened operators, but it also has FR128 register class to
+    // pass and return f128 values. Hence a legalized node can have f128 type.
     if (!IgnoreNodeResults(&Node))
       for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i)
-        if (!isTypeLegal(Node.getValueType(i))) {
-          dbgs() << "Result type " << i << " illegal!\n";
+        if (!isTypeLegal(Node.getValueType(i)) &&
+            !TLI.isTypeLegal(Node.getValueType(i))) {
+          dbgs() << "Result type " << i << " illegal: ";
+          Node.dump();
           Failed = true;
         }
 
     // Check that all operand types are legal.
     for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i)
       if (!IgnoreNodeResults(Node.getOperand(i).getNode()) &&
-          !isTypeLegal(Node.getOperand(i).getValueType())) {
-        dbgs() << "Operand type " << i << " illegal!\n";
+          !isTypeLegal(Node.getOperand(i).getValueType()) &&
+          !TLI.isTypeLegal(Node.getOperand(i).getValueType())) {
+        dbgs() << "Operand type " << i << " illegal: ";
+        Node.getOperand(i).dump();
         Failed = true;
       }
 
@@ -748,13 +761,23 @@
 }
 
 void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
-  assert(Result.getValueType() ==
-         TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+  // f128 of x86_64 could be kept in SSE registers,
+  // but sometimes softened to i128.
+  assert((Result.getValueType() ==
+          TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) ||
+          Op.getValueType() ==
+          TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) &&
          "Invalid type for softened float");
   AnalyzeNewValue(Result);
 
   SDValue &OpEntry = SoftenedFloats[Op];
-  assert(!OpEntry.getNode() && "Node is already converted to integer!");
+  // Allow repeated calls to save f128 type nodes
+  // or any node with type that transforms to itself.
+  // Many operations on these types are not softened.
+  assert((!OpEntry.getNode()||
+          Op.getValueType() ==
+          TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) &&
+         "Node is already converted to integer!");
   OpEntry = Result;
 }
 
Index: lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -53,12 +53,17 @@
     case TargetLowering::TypePromoteFloat:
       llvm_unreachable("Bitcast of a promotion-needing float should never need"
                        "expansion");
-    case TargetLowering::TypeSoftenFloat:
-      // Convert the integer operand instead.
-      SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
+    case TargetLowering::TypeSoftenFloat: {
+      // Expand the floating point operand only if it was converted to integers.
+      // Otherwise, it is a legal type like f128 that can be saved in a register.
+      auto SoftenedOp = GetSoftenedFloat(InOp);
+      if (SoftenedOp == InOp)
+        break;
+      SplitInteger(SoftenedOp, Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
       Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
       return;
+    }
     case TargetLowering::TypeExpandInteger:
     case TargetLowering::TypeExpandFloat: {
       auto &DL = DAG.getDataLayout();
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2893,8 +2893,10 @@
         return getConstantFP(APFloat(APFloat::IEEEhalf, Val), DL, VT);
       if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
         return getConstantFP(APFloat(APFloat::IEEEsingle, Val), DL, VT);
-      else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
+      if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
         return getConstantFP(APFloat(APFloat::IEEEdouble, Val), DL, VT);
+      if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
+        return getConstantFP(APFloat(APFloat::IEEEquad, Val), DL, VT);
       break;
     case ISD::BSWAP:
       return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -626,7 +626,10 @@
 
 static bool printOperand(raw_ostream &OS, const SelectionDAG *G,
                          const SDValue Value) {
-  if (shouldPrintInline(*Value.getNode())) {
+  if (!Value.getNode()) {
+    OS << "<null>";
+    return false;
+  } else if (shouldPrintInline(*Value.getNode())) {
     OS << Value->getOperationName(G) << ':';
     Value->print_types(OS, G);
     Value->print_details(OS, G);
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1072,7 +1072,9 @@
         Op.getOperand(0).getValueType().isFloatingPoint()) {
       bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType());
       bool i32Legal  = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
-      if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple()) {
+      if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple() &&
+           Op.getOperand(0).getValueType() != MVT::f128) {
+        // Cannot eliminate/lower SHL for f128 yet.
         EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32;
         // Make a FGETSIGN + SHL to move the sign bit into the appropriate
         // place.  We expect the SHL to be eliminated by other optimizations.
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -1654,6 +1654,10 @@
     if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger)
       Cost *= 2;
 
+    // Do not loop when legalization stops changing the type (e.g. f128).
+    if (MTy == LK.second)
+      return std::make_pair(Cost, MTy.getSimpleVT());
+
     // Keep legalizing the type.
     MTy = LK.second;
   }
Index: lib/CodeGen/TargetRegisterInfo.cpp
===================================================================
--- lib/CodeGen/TargetRegisterInfo.cpp
+++ lib/CodeGen/TargetRegisterInfo.cpp
@@ -171,16 +171,24 @@
 static inline
 const TargetRegisterClass *firstCommonClass(const uint32_t *A,
                                             const uint32_t *B,
-                                            const TargetRegisterInfo *TRI) {
+                                            const TargetRegisterInfo *TRI,
+                                            const MVT::SimpleValueType SVT =
+                                            MVT::SimpleValueType::Any) {
+  const MVT VT(SVT);
   for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32)
-    if (unsigned Common = *A++ & *B++)
-      return TRI->getRegClass(I + countTrailingZeros(Common));
+    // Scan all common classes in this word; the first bit may lack type VT.
+    for (unsigned Common = *A++ & *B++; Common; Common &= Common - 1) {
+      const auto *RC = TRI->getRegClass(I + countTrailingZeros(Common));
+      if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT))
+        return RC;
+    }
   return nullptr;
 }
 
 const TargetRegisterClass *
 TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
-                                      const TargetRegisterClass *B) const {
+                                      const TargetRegisterClass *B,
+                                      const MVT::SimpleValueType SVT) const {
   // First take care of the trivial cases.
   if (A == B)
     return A;
@@ -189,7 +197,7 @@
 
   // Register classes are ordered topologically, so the largest common
   // sub-class it the common sub-class with the smallest ID.
-  return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this);
+  return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, SVT);
 }
 
 const TargetRegisterClass *
Index: lib/Target/X86/X86CallingConv.td
===================================================================
--- lib/Target/X86/X86CallingConv.td
+++ lib/Target/X86/X86CallingConv.td
@@ -158,6 +158,7 @@
   // The X86-64 calling convention always returns FP values in XMM0.
   CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
   CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+  CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>,
 
   // MMX vector types are always returned in XMM0.
   CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
@@ -293,7 +294,7 @@
   CCIfType<[v64i1], CCPromoteToType<v64i8>>,
 
   // The first 8 FP/Vector arguments are passed in XMM registers.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+  CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
             CCIfSubtarget<"hasSSE1()",
             CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
 
@@ -318,7 +319,7 @@
 
   // Long doubles get stack slots whose size and alignment depends on the
   // subtarget.
-  CCIfType<[f80], CCAssignToStack<0, 0>>,
+  CCIfType<[f80, f128], CCAssignToStack<0, 0>>,
 
   // Vectors get 16-byte stack slots that are 16-byte aligned.
   CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -296,6 +296,7 @@
   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
+  setOperationAction(ISD::BR_CC            , MVT::f128,  Expand);
   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
@@ -303,6 +304,7 @@
   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::f128,  Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
@@ -415,12 +417,14 @@
   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f128 , Custom);
   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f128 , Custom);
   setOperationAction(ISD::SETCCE          , MVT::i8   , Custom);
   setOperationAction(ISD::SETCCE          , MVT::i16  , Custom);
   setOperationAction(ISD::SETCCE          , MVT::i32  , Custom);
@@ -619,8 +623,16 @@
   setOperationAction(ISD::FMA, MVT::f64, Expand);
   setOperationAction(ISD::FMA, MVT::f32, Expand);
 
-  // Long double always uses X87.
+  // Long double (f80) always uses X87; f128 uses FR128 on 64-bit with MMX.
   if (!Subtarget->useSoftFloat()) {
+    if (Subtarget->is64Bit() && Subtarget->hasMMX()) {
+      addRegisterClass(MVT::f128, &X86::FR128RegClass);
+      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+      setOperationAction(ISD::FABS , MVT::f128, Custom);
+      setOperationAction(ISD::FNEG , MVT::f128, Custom);
+      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
+    }
+
     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
@@ -2353,7 +2365,7 @@
     EVT CopyVT = VA.getLocVT();
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
-    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
@@ -2637,6 +2649,8 @@
         RC = &X86::FR32RegClass;
       else if (RegVT == MVT::f64)
         RC = &X86::FR64RegClass;
+      else if (RegVT == MVT::f128)
+        RC = &X86::FR128RegClass;
       else if (RegVT.is512BitVector())
         RC = &X86::VR512RegClass;
       else if (RegVT.is256BitVector())
@@ -13385,6 +13399,8 @@
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
 
+  bool IsF128 = (VT == MVT::f128);
+
   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
   // decide if we should generate a 16-byte constant mask when we only need 4 or
   // 8 bytes for the scalar case.
@@ -13397,6 +13413,11 @@
     LogicVT = VT;
     EltVT = VT.getVectorElementType();
     NumElts = VT.getVectorNumElements();
+  } else if (IsF128) {
+    // SSE instructions are used for optimized f128 logical operations.
+    LogicVT = MVT::f128;
+    EltVT = VT;
+    NumElts = 1;
   } else {
     // There are no scalar bitwise logical SSE/AVX instructions, so we
     // generate a 16-byte vector constant and logic op even for the scalar case.
@@ -13428,7 +13449,7 @@
     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
 
-  if (VT.isVector())
+  if (VT.isVector() || IsF128)
     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
 
   // For the scalar case extend to a 128-bit vector, perform the logic op,
@@ -13447,6 +13468,7 @@
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
   MVT SrcVT = Op1.getSimpleValueType();
+  bool IsF128 = (VT == MVT::f128);
 
   // If second operand is smaller, extend it first.
   if (SrcVT.bitsLT(VT)) {
@@ -13461,13 +13483,16 @@
 
   // At this point the operands and the result should have the same
   // type, and that won't be f80 since that is not custom lowered.
+  assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) &&
+         "Unexpected type in LowerFCOPYSIGN");
 
   const fltSemantics &Sem =
-      VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle;
+      VT == MVT::f64 ? APFloat::IEEEdouble :
+          (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle);
   const unsigned SizeInBits = VT.getSizeInBits();
 
   SmallVector<Constant *, 4> CV(
-      VT == MVT::f64 ? 2 : 4,
+      VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4),
       ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0))));
 
   // First, clear all bits but the sign bit from the second operand (sign).
@@ -13480,12 +13505,13 @@
   // Perform all logic operations as 16-byte vectors because there are no
   // scalar FP logic instructions in SSE. This allows load folding of the
   // constants into the logic instructions.
-  MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
+  MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32);
   SDValue Mask1 =
       DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx,
                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                   false, false, false, 16);
-  Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
+  if (!IsF128)
+    Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1);
   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1);
 
   // Next, clear the sign bit from the first operand (magnitude).
@@ -13494,8 +13520,9 @@
     APFloat APF = Op0CN->getValueAPF();
     // If the magnitude is a positive zero, the sign bit alone is enough.
     if (APF.isPosZero())
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
-                         DAG.getIntPtrConstant(0, dl));
+      return IsF128 ? SignBit :
+          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit,
+                      DAG.getIntPtrConstant(0, dl));
     APF.clearSign();
     CV[0] = ConstantFP::get(*Context, APF);
   } else {
@@ -13511,13 +13538,15 @@
                   false, false, false, 16);
   // If the magnitude operand wasn't a constant, we need to AND out the sign.
   if (!isa<ConstantFPSDNode>(Op0)) {
-    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
+    if (!IsF128)
+      Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0);
     Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val);
   }
   // OR the magnitude value with the sign bit.
   Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
-                     DAG.getIntPtrConstant(0, dl));
+  return IsF128 ? Val :
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val,
+                  DAG.getIntPtrConstant(0, dl));
 }
 
 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
@@ -14593,6 +14622,7 @@
   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
   // these.
   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
+      Op1.getValueType() != MVT::i128 &&  // getZExtValue() works up to i64 only.
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
 
     // If the input is a setcc, then reuse the input setcc or use a new one with
@@ -21998,6 +22028,7 @@
     return EmitLoweredTLSCall(MI, BB);
   case X86::CMOV_FR32:
   case X86::CMOV_FR64:
+  case X86::CMOV_FR128:
   case X86::CMOV_GR8:
   case X86::CMOV_GR16:
   case X86::CMOV_GR32:
@@ -23661,7 +23692,8 @@
   // ignored in unsafe-math mode).
   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
+      VT != MVT::f80 && VT != MVT::f128 &&
+      (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
       (Subtarget->hasSSE2() ||
        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -27749,6 +27781,7 @@
       case MVT::f64:
       case MVT::i64:
         return std::make_pair(0U, &X86::FR64RegClass);
+      // TODO: handle f128 and i128 in FR128RegClass.
       // Vector types.
       case MVT::v16i8:
       case MVT::v8i16:
@@ -27861,6 +27894,7 @@
     // target independent register mapper will just pick the first match it can
     // find, ignoring the required type.
 
+    // TODO: handle f128 and i128 in FR128RegClass.
     if (VT == MVT::f32 || VT == MVT::i32)
       Res.second = &X86::FR32RegClass;
     else if (VT == MVT::f64 || VT == MVT::i64)
Index: lib/Target/X86/X86InstrCompiler.td
===================================================================
--- lib/Target/X86/X86InstrCompiler.td
+++ lib/Target/X86/X86InstrCompiler.td
@@ -512,6 +512,7 @@
 
   defm _FR32   : CMOVrr_PSEUDO<FR32, f32>;
   defm _FR64   : CMOVrr_PSEUDO<FR64, f64>;
+  defm _FR128  : CMOVrr_PSEUDO<FR128, f128>;
   defm _V4F32  : CMOVrr_PSEUDO<VR128, v4f32>;
   defm _V2F64  : CMOVrr_PSEUDO<VR128, v2f64>;
   defm _V2I64  : CMOVrr_PSEUDO<VR128, v2i64>;
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -953,11 +953,12 @@
   return false;
 }]>;
 
-def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
-def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
-def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
-def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
-def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+def loadi8   : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
+def loadi64  : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+def loadf32  : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64  : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+def loadf80  : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>;
+def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>;
 
 def sextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
 def sextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -413,6 +413,8 @@
   def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
   def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
   def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(f128  (bitconvert (i128  FR128:$src))), (f128  FR128:$src)>;
+  def : Pat<(i128  (bitconvert (f128  FR128:$src))), (i128  FR128:$src)>;
 }
 
 // Bitcasts between 256-bit vector types. Return the original type since
@@ -8851,3 +8853,48 @@
     defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>;
   }
 }
+
+//===----------------------------------------------------------------------===//
+// Extra selection patterns for FR128, f128, f128mem
+
+def : Pat<(store (f128 FR128:$src), addr:$dst),
+          (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;
+// When the data is used as floating point, "movaps" should be faster and shorter
+// than "movdqa". "movaps" is in SSE and movdqa is in SSE2.
+
+def : Pat<(loadf128 addr:$src),
+          (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;
+
+// andps is faster and shorter than andpd; andps is SSE and andpd is SSE2.
+def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
+          (COPY_TO_REGCLASS (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), FR128)>;
+
+def : Pat<(X86fand FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(and FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
+          (COPY_TO_REGCLASS (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), FR128)>;
+
+def : Pat<(X86for FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(or FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                                    (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
+          (COPY_TO_REGCLASS (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), FR128)>;
+
+def : Pat<(X86fxor FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+def : Pat<(xor FR128:$src1, FR128:$src2),
+          (COPY_TO_REGCLASS (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
+                                     (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
Index: lib/Target/X86/X86MCInstLower.cpp
===================================================================
--- lib/Target/X86/X86MCInstLower.cpp
+++ lib/Target/X86/X86MCInstLower.cpp
@@ -1373,7 +1373,19 @@
           if (isa<UndefValue>(COp)) {
             CS << "u";
           } else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
-            CS << CI->getZExtValue();
+            if (CI->getBitWidth() <= 64) {
+              CS << CI->getZExtValue();
+            } else {
+              // Print a multi-word constant as (w0,w1,...).
+              auto Val = CI->getValue();
+              CS << "(";
+              for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
+                if (i > 0)
+                  CS << ",";
+                CS << Val.getRawData()[i];
+              }
+              CS << ")";
+            }
           } else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
             SmallString<32> Str;
             CF->getValueAPF().toString(Str);
Index: lib/Target/X86/X86RegisterInfo.td
===================================================================
--- lib/Target/X86/X86RegisterInfo.td
+++ lib/Target/X86/X86RegisterInfo.td
@@ -423,6 +423,8 @@
 
 def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>;
 
+def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>;
+
 
 // FIXME: This sets up the floating point register files as though they are f64
 // values, though they really are f80 values.  This will cause us to spill
Index: test/CodeGen/X86/fp128-calling-conv.ll
===================================================================
--- test/CodeGen/X86/fp128-calling-conv.ll
+++ test/CodeGen/X86/fp128-calling-conv.ll
@@ -0,0 +1,197 @@
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+
+; double myD = 1.0;
+@myD = global double 1.000000e+00, align 8
+
+; long double myFP80 = 1.0L;  // x86_64-linux-gnu
+@myFP80 = global x86_fp80 0xK3FFF8000000000000000, align 16
+
+; long double myFP128 = 1.0L;  // x86_64-linux-android
+@myFP128 = global fp128 0xL00000000000000003FFF000000000000, align 16
+
+; The first few parameters are passed in registers and the other are on stack.
+
+define i64 @TestParam_L_0(i64 %d0, i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, i64 %d10, i64 %d11, i64 %d12, i64 %d13, i64 %d14, i64 %d15, i64 %d16, i64 %d17, i64 %d18, i64 %d19) {
+entry:
+  ret i64 %d0
+; CHECK-LABEL: TestParam_L_0:
+; CHECK:       movq %rdi, %rax
+; CHECK-NEXT:  retq
+}
+
+define i64 @TestParam_L_1(i64 %d0, i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, i64 %d10, i64 %d11, i64 %d12, i64 %d13, i64 %d14, i64 %d15, i64 %d16, i64 %d17, i64 %d18, i64 %d19) {
+entry:
+  ret i64 %d1
+; CHECK-LABEL: TestParam_L_1:
+; CHECK:       movq %rsi, %rax
+; CHECK-NEXT:  retq
+}
+
+define i64 @TestParam_L_2(i64 %d0, i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, i64 %d10, i64 %d11, i64 %d12, i64 %d13, i64 %d14, i64 %d15, i64 %d16, i64 %d17, i64 %d18, i64 %d19) {
+entry:
+  ret i64 %d2
+; CHECK-LABEL: TestParam_L_2:
+; CHECK:       movq %rdx, %rax
+; CHECK-NEXT:  retq
+}
+
+define i64 @TestParam_L_3(i64 %d0, i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, i64 %d10, i64 %d11, i64 %d12, i64 %d13, i64 %d14, i64 %d15, i64 %d16, i64 %d17, i64 %d18, i64 %d19) {
+entry:
+  ret i64 %d3
+; CHECK-LABEL: TestParam_L_3:
+; CHECK:       movq %rcx, %rax
+; CHECK-NEXT:  retq
+}
+
+define i64 @TestParam_L_4(i64 %d0, i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, i64 %d10, i64 %d11, i64 %d12, i64 %d13, i64 %d14, i64 %d15, i64 %d16, i64 %d17, i64 %d18, i64 %d19) {
+entry:
+  ret i64 %d4
+; CHECK-LABEL: TestParam_L_4:
+; CHECK:       movq %r8, %rax
+; CHECK-NEXT:  retq
+}
+
+define i64 @TestParam_L_5(i64 %d0, i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, i64 %d10, i64 %d11, i64 %d12, i64 %d13, i64 %d14, i64 %d15, i64 %d16, i64 %d17, i64 %d18, i64 %d19) {
+entry:
+  ret i64 %d5
+; CHECK-LABEL: TestParam_L_5:
+; CHECK:       movq %r9, %rax
+; CHECK-NEXT:  retq
+}
+
+define i64 @TestParam_L_6(i64 %d0, i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, i64 %d10, i64 %d11, i64 %d12, i64 %d13, i64 %d14, i64 %d15, i64 %d16, i64 %d17, i64 %d18, i64 %d19) {
+entry:
+  ret i64 %d6
+; CHECK-LABEL: TestParam_L_6:
+; CHECK:       movq 8(%rsp), %rax
+; CHECK-NEXT:  retq
+}
+
+define i64 @TestParam_L_7(i64 %d0, i64 %d1, i64 %d2, i64 %d3, i64 %d4, i64 %d5, i64 %d6, i64 %d7, i64 %d8, i64 %d9, i64 %d10, i64 %d11, i64 %d12, i64 %d13, i64 %d14, i64 %d15, i64 %d16, i64 %d17, i64 %d18, i64 %d19) {
+entry:
+  ret i64 %d7
+; CHECK-LABEL: TestParam_L_7:
+; CHECK:       movq 16(%rsp), %rax
+; CHECK-NEXT:  retq
+}
+
+define float @TestParam_F_0(float %d0, float %d1, float %d2, float %d3, float %d4, float %d5, float %d6, float %d7, float %d8, float %d9, float %d10, float %d11, float %d12, float %d13, float %d14, float %d15, float %d16, float %d17, float %d18, float %d19) {
+entry:
+  ret float %d0
+; CHECK-LABEL: TestParam_F_0:
+; CHECK-NOT:   mov
+; CHECK:       retq
+}
+
+define float @TestParam_F_1(float %d0, float %d1, float %d2, float %d3, float %d4, float %d5, float %d6, float %d7, float %d8, float %d9, float %d10, float %d11, float %d12, float %d13, float %d14, float %d15, float %d16, float %d17, float %d18, float %d19) {
+entry:
+  ret float %d1
+; CHECK-LABEL: TestParam_F_1:
+; CHECK:       movaps  %xmm1, %xmm0
+; CHECK-NEXT:  retq
+}
+
+define float @TestParam_F_7(float %d0, float %d1, float %d2, float %d3, float %d4, float %d5, float %d6, float %d7, float %d8, float %d9, float %d10, float %d11, float %d12, float %d13, float %d14, float %d15, float %d16, float %d17, float %d18, float %d19) {
+entry:
+  ret float %d7
+; CHECK-LABEL: TestParam_F_7:
+; CHECK:       movaps  %xmm7, %xmm0
+; CHECK-NEXT:  retq
+}
+
+define float @TestParam_F_8(float %d0, float %d1, float %d2, float %d3, float %d4, float %d5, float %d6, float %d7, float %d8, float %d9, float %d10, float %d11, float %d12, float %d13, float %d14, float %d15, float %d16, float %d17, float %d18, float %d19) {
+entry:
+  ret float %d8
+; CHECK-LABEL: TestParam_F_8:
+; CHECK:       movss 8(%rsp), %xmm0
+; CHECK-NEXT:  retq
+}
+
+define float @TestParam_F_9(float %d0, float %d1, float %d2, float %d3, float %d4, float %d5, float %d6, float %d7, float %d8, float %d9, float %d10, float %d11, float %d12, float %d13, float %d14, float %d15, float %d16, float %d17, float %d18, float %d19) {
+entry:
+  ret float %d9
+; CHECK-LABEL: TestParam_F_9:
+; CHECK:       movss 16(%rsp), %xmm0
+; CHECK-NEXT:  retq
+}
+
+define double @TestParam_D_0(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13, double %d14, double %d15, double %d16, double %d17, double %d18, double %d19) {
+entry:
+  ret double %d0
+; CHECK-LABEL: TestParam_D_0:
+; CHECK-NOT:   mov
+; CHECK:       retq
+}
+
+define double @TestParam_D_1(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13, double %d14, double %d15, double %d16, double %d17, double %d18, double %d19) {
+entry:
+  ret double %d1
+; CHECK-LABEL: TestParam_D_1:
+; CHECK:       movaps  %xmm1, %xmm0
+; CHECK-NEXT:  retq
+}
+
+define double @TestParam_D_7(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13, double %d14, double %d15, double %d16, double %d17, double %d18, double %d19) {
+entry:
+  ret double %d7
+; CHECK-LABEL: TestParam_D_7:
+; CHECK:       movaps  %xmm7, %xmm0
+; CHECK-NEXT:  retq
+}
+
+define double @TestParam_D_8(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13, double %d14, double %d15, double %d16, double %d17, double %d18, double %d19) {
+entry:
+  ret double %d8
+; CHECK-LABEL: TestParam_D_8:
+; CHECK:       movsd 8(%rsp), %xmm0
+; CHECK-NEXT:  retq
+}
+
+define double @TestParam_D_9(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13, double %d14, double %d15, double %d16, double %d17, double %d18, double %d19) {
+entry:
+  ret double %d9
+; CHECK-LABEL: TestParam_D_9:
+; CHECK:       movsd 16(%rsp), %xmm0
+; CHECK-NEXT:  retq
+}
+
+define fp128 @TestParam_FP128_0(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) {
+entry:
+  ret fp128 %d0
+; CHECK-LABEL: TestParam_FP128_0:
+; CHECK-NOT:   mov
+; CHECK:       retq
+}
+
+define fp128 @TestParam_FP128_1(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) {
+entry:
+  ret fp128 %d1
+; CHECK-LABEL: TestParam_FP128_1:
+; CHECK:       movaps  %xmm1, %xmm0
+; CHECK-NEXT:  retq
+}
+
+define fp128 @TestParam_FP128_7(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) {
+entry:
+  ret fp128 %d7
+; CHECK-LABEL: TestParam_FP128_7:
+; CHECK:       movaps  %xmm7, %xmm0
+; CHECK-NEXT:  retq
+}
+
+define fp128 @TestParam_FP128_8(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) {
+entry:
+  ret fp128 %d8
+; CHECK-LABEL: TestParam_FP128_8:
+; CHECK:       movaps 8(%rsp), %xmm0
+; CHECK-NEXT:  retq
+}
+
+define fp128 @TestParam_FP128_9(fp128 %d0, fp128 %d1, fp128 %d2, fp128 %d3, fp128 %d4, fp128 %d5, fp128 %d6, fp128 %d7, fp128 %d8, fp128 %d9, fp128 %d10, fp128 %d11, fp128 %d12, fp128 %d13, fp128 %d14, fp128 %d15, fp128 %d16, fp128 %d17, fp128 %d18, fp128 %d19) {
+entry:
+  ret fp128 %d9
+; CHECK-LABEL: TestParam_FP128_9:
+; CHECK:       movaps 24(%rsp), %xmm0
+; CHECK-NEXT:  retq
+}
Index: test/CodeGen/X86/fp128-cast.ll
===================================================================
--- test/CodeGen/X86/fp128-cast.ll
+++ test/CodeGen/X86/fp128-cast.ll
@@ -0,0 +1,372 @@
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+
+; Check soft floating point conversion function calls.
+
+@vi32 = common global i32 0, align 4
+@vi64 = common global i64 0, align 8
+@vf32 = common global float 0.000000e+00, align 4
+@vf64 = common global double 0.000000e+00, align 8
+@vf128 = common global fp128 0xL00000000000000000000000000000000, align 16
+
+define void @TestCastF32_I32() {
+entry:  ; float -> i32 (fptosi) from @vf32 into @vi32; expects cvttss2si and no library call.
+  %0 = load float, float* @vf32, align 4
+  %conv = fptosi float %0 to i32
+  store i32 %conv, i32* @vi32, align 4
+  ret void
+; CHECK-LABEL: TestCastF32_I32:
+; CHECK:       cvttss2si  vf32(%rip), %eax
+; CHECK-NEXT:  movl       %eax, vi32(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastF32_I64() {
+entry:  ; float -> i32 (fptosi), then sext to i64 (not a direct float -> i64 conversion); expects cvttss2si + cltq.
+  %0 = load float, float* @vf32, align 4
+  %conv = fptosi float %0 to i32
+  %conv1 = sext i32 %conv to i64
+  store i64 %conv1, i64* @vi64, align 8
+  ret void
+; CHECK-LABEL: TestCastF32_I64:
+; CHECK:       cvttss2si  vf32(%rip), %eax
+; CHECK-NEXT:  cltq
+; CHECK-NEXT:  movq       %rax, vi64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastF32_F64() {
+entry:  ; float -> double (fpext) from @vf32 into @vf64; expects cvtss2sd and no library call.
+  %0 = load float, float* @vf32, align 4
+  %conv = fpext float %0 to double
+  store double %conv, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: TestCastF32_F64:
+; CHECK:       movss      vf32(%rip), %xmm0
+; CHECK-NEXT:  cvtss2sd   %xmm0, %xmm0
+; CHECK-NEXT:  movsd      %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastF32_F128() {
+entry:  ; float -> fp128 (fpext) from @vf32 into @vf128; expects a call to __extendsftf2.
+  %0 = load float, float* @vf32, align 4
+  %conv = fpext float %0 to fp128
+  store fp128 %conv, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: TestCastF32_F128:
+; CHECK:       movss      vf32(%rip), %xmm0
+; CHECK-NEXT:  callq      __extendsftf2
+; CHECK-NEXT:  movaps     %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @TestCastF64_I32() {
+entry:  ; double -> i32 (fptosi) from @vf64 into @vi32; expects cvttsd2si and no library call.
+  %0 = load double, double* @vf64, align 8
+  %conv = fptosi double %0 to i32
+  store i32 %conv, i32* @vi32, align 4
+  ret void
+; CHECK-LABEL: TestCastF64_I32:
+; CHECK:       cvttsd2si  vf64(%rip), %eax
+; CHECK-NEXT:  movl       %eax, vi32(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastF64_I64() {
+entry:  ; double -> i32 (fptosi), then sext to i64 (not a direct double -> i64 conversion); expects cvttsd2si + cltq.
+  %0 = load double, double* @vf64, align 8
+  %conv = fptosi double %0 to i32
+  %conv1 = sext i32 %conv to i64
+  store i64 %conv1, i64* @vi64, align 8
+  ret void
+; CHECK-LABEL: TestCastF64_I64:
+; CHECK:       cvttsd2si  vf64(%rip), %eax
+; CHECK-NEXT:  cltq
+; CHECK-NEXT:  movq       %rax, vi64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastF64_F32() {
+entry:  ; double -> float (fptrunc) from @vf64 into @vf32; expects cvtsd2ss and no library call.
+  %0 = load double, double* @vf64, align 8
+  %conv = fptrunc double %0 to float
+  store float %conv, float* @vf32, align 4
+  ret void
+; CHECK-LABEL: TestCastF64_F32:
+; CHECK:       movsd      vf64(%rip), %xmm0
+; CHECK-NEXT:  cvtsd2ss   %xmm0, %xmm0
+; CHECK-NEXT:  movss      %xmm0, vf32(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastF64_F128() {
+entry:  ; double -> fp128 (fpext) from @vf64 into @vf128; expects a call to __extenddftf2.
+  %0 = load double, double* @vf64, align 8
+  %conv = fpext double %0 to fp128
+  store fp128 %conv, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: TestCastF64_F128:
+; CHECK:       movsd      vf64(%rip), %xmm0
+; CHECK-NEXT:  callq      __extenddftf2
+; CHECK-NEXT:  movapd     %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @TestCastF128_I32() {
+entry:  ; fp128 -> i32 (fptosi) from @vf128 into @vi32; expects a call to __fixtfsi.
+  %0 = load fp128, fp128* @vf128, align 16
+  %conv = fptosi fp128 %0 to i32
+  store i32 %conv, i32* @vi32, align 4
+  ret void
+; CHECK-LABEL: TestCastF128_I32:
+; CHECK:        movaps     vf128(%rip), %xmm0
+; CHECK-NEXT:   callq      __fixtfsi
+; CHECK-NEXT:   movl       %eax, vi32(%rip)
+; CHECK:        retq
+}
+
+define void @TestCastF128_I64() {
+entry:  ; fp128 -> i32 (fptosi), then sext to i64; expects a call to __fixtfsi followed by cltq.
+  %0 = load fp128, fp128* @vf128, align 16
+  %conv = fptosi fp128 %0 to i32
+  %conv1 = sext i32 %conv to i64
+  store i64 %conv1, i64* @vi64, align 8
+  ret void
+; CHECK-LABEL: TestCastF128_I64:
+; CHECK:       movaps      vf128(%rip), %xmm0
+; CHECK-NEXT:  callq       __fixtfsi
+; CHECK-NEXT:  cltq
+; CHECK-NEXT:  movq        %rax, vi64(%rip)
+; CHECK:       retq
+}
+
+define void @TestCastF128_F32() {
+entry:  ; fp128 -> float (fptrunc) from @vf128 into @vf32; expects a call to __trunctfsf2.
+  %0 = load fp128, fp128* @vf128, align 16
+  %conv = fptrunc fp128 %0 to float
+  store float %conv, float* @vf32, align 4
+  ret void
+; CHECK-LABEL: TestCastF128_F32:
+; CHECK:       movaps      vf128(%rip), %xmm0
+; CHECK-NEXT:  callq       __trunctfsf2
+; CHECK-NEXT:  movss       %xmm0, vf32(%rip)
+; CHECK:       retq
+}
+
+define void @TestCastF128_F64() {
+entry:  ; fp128 -> double (fptrunc) from @vf128 into @vf64; expects a call to __trunctfdf2.
+  %0 = load fp128, fp128* @vf128, align 16
+  %conv = fptrunc fp128 %0 to double
+  store double %conv, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: TestCastF128_F64:
+; CHECK:       movapd      vf128(%rip), %xmm0
+; CHECK-NEXT:  callq       __trunctfdf2
+; CHECK-NEXT:  movsd       %xmm0, vf64(%rip)
+; CHECK:       retq
+}
+
+define void @TestCastI32_I64() {
+entry:  ; i32 -> float (sitofp) -> i64 (fptosi) round trip from @vi32 into @vi64; no library call expected.
+  %0 = load i32, i32* @vi32, align 4
+  %conv = sitofp i32 %0 to float
+  %conv1 = fptosi float %conv to i64
+  store i64 %conv1, i64* @vi64, align 8
+  ret void
+; CHECK-LABEL: TestCastI32_I64:
+; CHECK:       cvtsi2ssl  vi32(%rip), %xmm0
+; CHECK-NEXT:  cvttss2si  %xmm0, %rax
+; CHECK-NEXT:  movq       %rax, vi64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastI32_F32() {
+entry:  ; i32 -> float (sitofp) from @vi32 into @vf32; expects cvtsi2ssl and no library call.
+  %0 = load i32, i32* @vi32, align 4
+  %conv = sitofp i32 %0 to float
+  store float %conv, float* @vf32, align 4
+  ret void
+; CHECK-LABEL: TestCastI32_F32:
+; CHECK:       cvtsi2ssl  vi32(%rip), %xmm0
+; CHECK-NEXT:  movss      %xmm0, vf32(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastI32_F64() {
+entry:  ; i32 -> double (sitofp) from @vi32 into @vf64; expects cvtsi2sdl and no library call.
+  %0 = load i32, i32* @vi32, align 4
+  %conv = sitofp i32 %0 to double
+  store double %conv, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: TestCastI32_F64:
+; CHECK:       cvtsi2sdl  vi32(%rip), %xmm0
+; CHECK-NEXT:  movsd      %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastI32_F128() {
+entry:  ; i32 -> fp128 (sitofp) from @vi32 into @vf128; expects a call to __floatsitf.
+  %0 = load i32, i32* @vi32, align 4
+  %conv = sitofp i32 %0 to fp128
+  store fp128 %conv, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: TestCastI32_F128:
+; CHECK:       movl       vi32(%rip), %edi
+; CHECK-NEXT:  callq      __floatsitf
+; CHECK-NEXT:  movaps     %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @TestCastI64_I32(){
+entry:  ; i64 -> float (sitofp) -> i32 (fptosi) round trip from @vi64 into @vi32; no library call expected.
+  %0 = load i64, i64* @vi64, align 8
+  %conv = sitofp i64 %0 to float
+  %conv1 = fptosi float %conv to i32
+  store i32 %conv1, i32* @vi32, align 4
+  ret void
+; CHECK-LABEL: TestCastI64_I32:
+; CHECK:       cvtsi2ssq  vi64(%rip), %xmm0
+; CHECK:       cvttss2si  %xmm0, %eax
+; CHECK:       movl       %eax, vi32(%rip)
+; CHECK:       retq
+}
+
+define void @TestCastI64_F32(){
+entry:  ; i64 -> float (sitofp) from @vi64 into @vf32; expects cvtsi2ssq and no library call.
+  %0 = load i64, i64* @vi64, align 8
+  %conv = sitofp i64 %0 to float
+  store float %conv, float* @vf32, align 4
+  ret void
+; CHECK-LABEL: TestCastI64_F32:
+; CHECK:       cvtsi2ssq  vi64(%rip), %xmm0
+; CHECK-NEXT:  movss      %xmm0, vf32(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastI64_F64(){
+entry:  ; i64 -> double (sitofp) from @vi64 into @vf64; expects cvtsi2sdq and no library call.
+  %0 = load i64, i64* @vi64, align 8
+  %conv = sitofp i64 %0 to double
+  store double %conv, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: TestCastI64_F64:
+; CHECK:       cvtsi2sdq  vi64(%rip), %xmm0
+; CHECK-NEXT:  movsd      %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @TestCastI64_F128(){
+entry:  ; i64 -> fp128 (sitofp) from @vi64 into @vf128; expects a call to __floatditf.
+  %0 = load i64, i64* @vi64, align 8
+  %conv = sitofp i64 %0 to fp128
+  store fp128 %conv, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: TestCastI64_F128:
+; CHECK:       movq       vi64(%rip), %rdi
+; CHECK-NEXT:  callq      __floatditf
+; CHECK-NEXT:  movaps     %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define i32 @TestConst32(float %v) {
+entry:  ; Ordered greater-than compare of a float against the constant 1.0; expects ucomiss + seta.
+  %cmp = fcmp ogt float %v, 1.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestConst32:
+; CHECK:       ucomiss {{.*}}, %xmm0
+; CHECK-NEXT:  seta %al
+; CHECK:       retq
+}
+
+define i32 @TestConst64(double %v) {
+entry:  ; Ordered greater-than compare of a double against the constant 1.0; expects ucomisd + seta.
+  %cmp = fcmp ogt double %v, 1.000000e+00
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestConst64:
+; CHECK:       ucomisd {{.*}}, %xmm0
+; CHECK-NEXT:  seta %al
+; CHECK:       retq
+}
+
+define i32 @TestConst128(fp128 %v) {
+entry:  ; Ordered greater-than compare of an fp128 against an fp128 constant; expects the constant in %xmm1 and a call to __gttf2.
+  %cmp = fcmp ogt fp128 %v, 0xL00000000000000003FFF000000000000
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestConst128:
+; CHECK:       movaps {{.*}}, %xmm1
+; CHECK-NEXT:  callq __gttf2
+; CHECK-NEXT:  testl %eax, %eax
+; CHECK-NEXT:  setg %al
+; CHECK:       retq
+}
+
+define i32 @TestBits128(fp128 %ld) {
+entry:  ; Squares %ld (expects __multf3), then tests the product's i128 bit pattern: ((bits >> 32) | bits) truncated to i32, compared with 0.
+  %mul = fmul fp128 %ld, %ld
+  %0 = bitcast fp128 %mul to i128
+  %u.sroa.0.4.extract.shift = lshr i128 %0, 32
+  %or5 = or i128 %u.sroa.0.4.extract.shift, %0
+  %or = trunc i128 %or5 to i32
+  %cmp = icmp eq i32 %or, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestBits128:
+; CHECK:       movaps %xmm0, %xmm1
+; CHECK-NEXT:  callq __multf3
+; CHECK-NEXT:  movaps %xmm0, (%rsp)
+; CHECK-NEXT:  movq (%rsp),
+; CHECK-NEXT:  movq %
+; CHECK-NEXT:  shrq $32,
+; CHECK:       orl
+; CHECK-NEXT:  sete %al
+; CHECK-NEXT:  movzbl %al, %eax
+; CHECK:       retq
+}
+
+define fp128 @TestPair128(i64 %a, i64 %b) {
+entry:  ; Builds the i128 (%a << 64) | %b, adds 3, and bitcasts it to fp128; the result is expected to go through stack slots into %xmm0.
+  %conv = zext i64 %a to i128
+  %shl = shl nuw i128 %conv, 64
+  %conv1 = zext i64 %b to i128
+  %or = or i128 %shl, %conv1
+  %add = add i128 %or, 3
+  %0 = bitcast i128 %add to fp128
+  ret fp128 %0
+; CHECK-LABEL: TestPair128:
+; CHECK:       addq $3, %rsi
+; CHECK-NEXT:  movq %rsi, -24(%rsp)
+; CHECK-NEXT:  adcq $0, %rdi
+; CHECK-NEXT:  movq %rdi, -16(%rsp)
+; CHECK-NEXT:  movaps -24(%rsp), %xmm0
+; CHECK-NEXT:  retq
+}
+
+define fp128 @TestTruncCopysign(fp128 %x, i32 %n) {
+entry:  ; When %n > 50000: truncates %x to double, calls copysign on a double constant with it, and extends back to fp128; otherwise returns %x.
+  %cmp = icmp sgt i32 %n, 50000
+  br i1 %cmp, label %if.then, label %cleanup
+
+if.then:                                          ; preds = %entry
+  %conv = fptrunc fp128 %x to double
+  %call = tail call double @copysign(double 0x7FF0000000000000, double %conv) #2
+  %conv1 = fpext double %call to fp128
+  br label %cleanup
+
+cleanup:                                          ; preds = %entry, %if.then
+  %retval.0 = phi fp128 [ %conv1, %if.then ], [ %x, %entry ]
+  ret fp128 %retval.0
+; CHECK-LABEL: TestTruncCopysign:
+; CHECK:       callq __trunctfdf2
+; CHECK-NEXT:  andpd {{.*}}, %xmm0
+; CHECK-NEXT:  orpd {{.*}}, %xmm0
+; CHECK-NEXT:  callq __extenddftf2
+; CHECK:       retq
+}
+
+declare double @copysign(double, double)
+
+attributes #2 = { nounwind readnone }
Index: test/CodeGen/X86/fp128-compare.ll
===================================================================
--- test/CodeGen/X86/fp128-compare.ll
+++ test/CodeGen/X86/fp128-compare.ll
@@ -0,0 +1,211 @@
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+
+define i32 @TestComp32GT(float %d1, float %d2) {
+entry:  ; Ordered greater-than on floats; expects ucomiss + seta, no library call.
+  %cmp = fcmp ogt float %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp32GT:
+; CHECK:       ucomiss %xmm1, %xmm0
+; CHECK-NEXT:  seta %al
+}
+
+define i32 @TestComp64GT(double %d1, double %d2) {
+entry:  ; Ordered greater-than on doubles; expects ucomisd + seta, no library call.
+  %cmp = fcmp ogt double %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp64GT:
+; CHECK:       ucomisd %xmm1, %xmm0
+; CHECK-NEXT:  seta %al
+}
+
+define i32 @TestComp128GT(fp128 %d1, fp128 %d2) {
+entry:  ; Ordered greater-than on fp128 values; expects a call to __gttf2 followed by setg.
+  %cmp = fcmp ogt fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp128GT:
+; CHECK:       callq __gttf2
+; CHECK:       setg  %al
+; CHECK:       retq
+}
+
+define i32 @TestComp32GE(float %d1, float %d2) {
+entry:  ; Ordered greater-or-equal on floats; expects ucomiss + setae, no library call.
+  %cmp = fcmp oge float %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp32GE:
+; CHECK:       ucomiss %xmm1, %xmm0
+; CHECK-NEXT:  setae %al
+}
+
+define i32 @TestComp64GE(double %d1, double %d2) {
+entry:  ; Ordered greater-or-equal on doubles; expects ucomisd + setae, no library call.
+  %cmp = fcmp oge double %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp64GE:
+; CHECK:       ucomisd %xmm1, %xmm0
+; CHECK-NEXT:  setae %al
+}
+
+define i32 @TestComp128GE(fp128 %d1, fp128 %d2) {
+entry:  ; Ordered greater-or-equal on fp128 values; expects a call to __getf2 and a testl of its result.
+  %cmp = fcmp oge fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp128GE:
+; CHECK:       callq __getf2
+; CHECK:       testl %eax, %eax
+; CHECK:       retq
+}
+
+define i32 @TestComp32LT(float %d1, float %d2) {
+entry:  ; Ordered less-than on floats; expects ucomiss with operands in the order %xmm0, %xmm1 followed by seta.
+  %cmp = fcmp olt float %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp32LT:
+; CHECK:       ucomiss %xmm0, %xmm1
+; CHECK-NEXT:  seta %al
+}
+
+define i32 @TestComp64LT(double %d1, double %d2) {
+entry:  ; Ordered less-than on doubles; expects ucomisd with operands in the order %xmm0, %xmm1 followed by seta.
+  %cmp = fcmp olt double %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp64LT:
+; CHECK:       ucomisd %xmm0, %xmm1
+; CHECK-NEXT:  seta %al
+}
+
+define i32 @TestComp128LT(fp128 %d1, fp128 %d2) {
+entry:  ; Ordered less-than on fp128 values; expects a call to __lttf2 whose result is shifted right by 31.
+  %cmp = fcmp olt fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp128LT:
+; CHECK:       callq __lttf2
+; CHECK-NEXT:  shrl $31, %eax
+; CHECK:       retq
+}
+
+define i32 @TestComp32LE(float %d1, float %d2) {
+entry:  ; Ordered less-or-equal on floats; expects ucomiss with operands in the order %xmm0, %xmm1 followed by setae.
+  %cmp = fcmp ole float %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp32LE:
+; CHECK:       ucomiss %xmm0, %xmm1
+; CHECK-NEXT:  setae %al
+}
+
+define i32 @TestComp64LE(double %d1, double %d2) {
+entry:  ; Ordered less-or-equal on doubles; expects ucomisd with operands in the order %xmm0, %xmm1 followed by setae.
+  %cmp = fcmp ole double %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp64LE:
+; CHECK:       ucomisd %xmm0, %xmm1
+; CHECK-NEXT:  setae %al
+}
+
+define i32 @TestComp128LE(fp128 %d1, fp128 %d2) {
+entry:  ; Ordered less-or-equal on fp128 values; expects a call to __letf2 and a testl of its result.
+  %cmp = fcmp ole fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp128LE:
+; CHECK:       callq __letf2
+; CHECK-NEXT:  testl %eax, %eax
+; CHECK:       retq
+}
+
+define i32 @TestComp32EQ(float %d1, float %d2) {
+entry:  ; Ordered equality on floats; expects cmpeqss, with the result moved to %eax and masked to one bit.
+  %cmp = fcmp oeq float %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp32EQ:
+; CHECK:       cmpeqss %xmm1, %xmm0
+; CHECK-NEXT:  movd %xmm0, %eax
+; CHECK-NEXT:  andl $1, %eax
+; CHECK-NEXT:  retq
+}
+
+define i32 @TestComp64EQ(double %d1, double %d2) {
+entry:  ; Ordered equality on doubles; expects cmpeqsd, with the result moved to %rax and masked to one bit.
+  %cmp = fcmp oeq double %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp64EQ:
+; CHECK:       cmpeqsd %xmm1, %xmm0
+; CHECK-NEXT:  movd %xmm0, %rax
+; CHECK-NEXT:  andl $1, %eax
+; CHECK-NEXT:  retq
+}
+
+define i32 @TestComp128EQ(fp128 %d1, fp128 %d2) {
+entry:  ; Ordered equality on fp128 values; expects a call to __eqtf2 and a testl of its result.
+  %cmp = fcmp oeq fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp128EQ:
+; CHECK:       callq __eqtf2
+; CHECK-NEXT:  testl %eax, %eax
+; CHECK:       retq
+}
+
+define i32 @TestComp32NE(float %d1, float %d2) {
+entry:  ; Unordered-or-not-equal (une) on floats; expects cmpneqss, with the result moved to %eax and masked to one bit.
+  %cmp = fcmp une float %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp32NE:
+; CHECK:       cmpneqss %xmm1, %xmm0
+; CHECK-NEXT:  movd %xmm0, %eax
+; CHECK-NEXT:  andl $1, %eax
+; CHECK-NEXT:  retq
+}
+
+define i32 @TestComp64NE(double %d1, double %d2) {
+entry:  ; Unordered-or-not-equal (une) on doubles; expects cmpneqsd, with the result moved to %rax and masked to one bit.
+  %cmp = fcmp une double %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp64NE:
+; CHECK:       cmpneqsd %xmm1, %xmm0
+; CHECK-NEXT:  movd %xmm0, %rax
+; CHECK-NEXT:  andl $1, %eax
+; CHECK-NEXT:  retq
+}
+
+define i32 @TestComp128NE(fp128 %d1, fp128 %d2) {
+entry:  ; Unordered-or-not-equal (une) on fp128 values; expects a call to __netf2 and a testl of its result.
+  %cmp = fcmp une fp128 %d1, %d2
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+; CHECK-LABEL: TestComp128NE:
+; CHECK:       callq __netf2
+; CHECK-NEXT:  testl %eax, %eax
+; CHECK:       retq
+}
+
+define fp128 @TestMax(fp128 %x, fp128 %y) {
+entry:  ; Selects %x when %x > %y (ordered), otherwise %y; expects a __gttf2 call with both operands kept available across it.
+  %cmp = fcmp ogt fp128 %x, %y
+  %cond = select i1 %cmp, fp128 %x, fp128 %y
+  ret fp128 %cond
+; CHECK-LABEL: TestMax:
+; CHECK: movaps %xmm1
+; CHECK: movaps %xmm0
+; CHECK: callq __gttf2
+; CHECK: movaps {{.*}}, %xmm0
+; CHECK: testl %eax, %eax
+; CHECK: movaps {{.*}}, %xmm0
+; CHECK: retq
+}
Index: test/CodeGen/X86/fp128-i128.ll
===================================================================
--- test/CodeGen/X86/fp128-i128.ll
+++ test/CodeGen/X86/fp128-i128.ll
@@ -0,0 +1,241 @@
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+
+; Check some i128 instruction patterns triggered by fp128.
+
+define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
+entry:  ; Inserts the low 48 bits of %n into bits 64-111 of %s's bit pattern and tail-calls @foo with the result.
+  %0 = bitcast fp128 %s to i128
+  %1 = zext i64 %n to i128
+  %bf.value = shl nuw i128 %1, 64
+  %bf.shl = and i128 %bf.value, 5192296858534809181786422619668480
+  %bf.clear = and i128 %0, -5192296858534809181786422619668481
+  %bf.set = or i128 %bf.shl, %bf.clear
+  %2 = bitcast i128 %bf.set to fp128
+  tail call void @foo(fp128 %2) #2
+  ret void
+; CHECK-LABEL: TestUnionLD1:
+; CHECK:       movaps %xmm0, -24(%rsp)
+; CHECK-NEXT:  movq -24(%rsp), %rax
+; CHECK-NEXT:  movabsq $281474976710655, %rcx
+; CHECK-NEXT:  andq %rdi, %rcx
+; CHECK-NEXT:  movabsq $-281474976710656, %rdx
+; CHECK-NEXT:  andq -16(%rsp), %rdx
+; CHECK-NEXT:  movq %rax, -40(%rsp)
+; CHECK-NEXT:  orq %rcx, %rdx
+; CHECK-NEXT:  movq %rdx, -32(%rsp)
+; CHECK-NEXT:  movaps -40(%rsp), %xmm0
+; CHECK-NEXT:  jmp foo
+}
+
+define fp128 @TestUnionLD2(fp128 %s) #0 {
+entry:  ; Clears the low 64 bits of %s's bit pattern; expects the high half copied and the low half stored as zero.
+  %0 = bitcast fp128 %s to i128
+  %bf.clear = and i128 %0, -18446744073709551616
+  %1 = bitcast i128 %bf.clear to fp128
+  ret fp128 %1
+; CHECK-LABEL: TestUnionLD2:
+; CHECK:       movaps %xmm0, -24(%rsp)
+; CHECK-NEXT:  movq -16(%rsp), %rax
+; CHECK-NEXT:  movq %rax, -32(%rsp)
+; CHECK-NEXT:  movq $0, -40(%rsp)
+; CHECK-NEXT:  movaps -40(%rsp), %xmm0
+; CHECK-NEXT:  retq
+}
+
+define fp128 @TestI128_1(fp128 %x) #0 {
+entry:  ; Clears bit 127 of %x's bit pattern, compares the result (olt) with an fp128 constant, and selects one of two fp128 constants.
+  %0 = bitcast fp128 %x to i128
+  %bf.clear = and i128 %0, 170141183460469231731687303715884105727
+  %1 = bitcast i128 %bf.clear to fp128
+  %cmp = fcmp olt fp128 %1, 0xL999999999999999A3FFB999999999999
+  %cond = select i1 %cmp, fp128 0xL00000000000000003FFF000000000000, fp128 0xL00000000000000004000000000000000
+  ret fp128 %cond
+; CHECK-LABEL: TestI128_1:
+; CHECK:       movaps %xmm0,
+; CHECK:       movabsq $9223372036854775807,
+; CHECK:       callq __lttf2
+; CHECK:       testl %eax, %eax
+; CHECK:       movaps {{.*}}, %xmm0
+; CHECK:       retq
+}
+
+define fp128 @TestI128_2(fp128 %x, fp128 %y) #0 {
+entry:  ; Returns %x when its bit pattern, read as a signed i128, is greater than -1; otherwise %y. Expects a compare of the high quadword only.
+  %0 = bitcast fp128 %x to i128
+  %cmp = icmp sgt i128 %0, -1
+  %cond = select i1 %cmp, fp128 %x, fp128 %y
+  ret fp128 %cond
+; CHECK-LABEL: TestI128_2:
+; CHECK:       movaps %xmm0, -24(%rsp)
+; CHECK-NEXT:  cmpq $0, -16(%rsp)
+; CHECK-NEXT:  jns
+; CHECK:       movaps %xmm1, %xmm0
+; CHECK:       retq
+}
+
+define fp128 @TestI128_3(fp128 %x, i32* nocapture readnone %ex) #0 {
+entry:  ; If a masked bit field of %x's pattern is zero, multiplies %x by an fp128 constant (expects __multf3) and overwrites that field with a constant.
+  %0 = bitcast fp128 %x to i128
+  %bf.cast = and i128 %0, 170135991163610696904058773219554885632
+  %cmp = icmp eq i128 %bf.cast, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %mul = fmul fp128 %x, 0xL00000000000000004201000000000000
+  %1 = bitcast fp128 %mul to i128
+  %bf.clear4 = and i128 %1, -170135991163610696904058773219554885633
+  %bf.set = or i128 %bf.clear4, 85060207136517546210586590865283612672
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  %u.sroa.0.0 = phi i128 [ %bf.set, %if.then ], [ %0, %entry ]
+  %2 = bitcast i128 %u.sroa.0.0 to fp128
+  ret fp128 %2
+; CHECK-LABEL: TestI128_3:
+; CHECK:       movaps %xmm0,
+; CHECK:       movabsq $9223090561878065152,
+; CHECK:       testq
+; CHECK:       callq __multf3
+; CHECK-NEXT:  movaps %xmm0
+; CHECK:       movabsq $-9223090561878065153,
+; CHECK:       movabsq $4611123068473966592,
+; CHECK:       retq
+}
+
+define fp128 @TestI128_4(fp128 %x) #0 {
+entry:  ; Clears the low 64 bits of %x's bit pattern and adds the result to %x; expects a call to __addtf3.
+  %0 = bitcast fp128 %x to i128
+  %bf.clear = and i128 %0, -18446744073709551616
+  %1 = bitcast i128 %bf.clear to fp128
+  %add = fadd fp128 %1, %x
+  ret fp128 %add
+; CHECK-LABEL: TestI128_4:
+; CHECK:       movaps %xmm0, %xmm1
+; CHECK-NEXT:  movaps %xmm1, 16(%rsp)
+; CHECK-NEXT:  movq 24(%rsp), %rax
+; CHECK-NEXT:  movq %rax, 8(%rsp)
+; CHECK-NEXT:  movq $0, (%rsp)
+; CHECK-NEXT:  movaps (%rsp), %xmm0
+; CHECK-NEXT:  callq __addtf3
+; CHECK:       retq
+}
+
+define { i64, i64 } @TestShift128(i64 %x.coerce0, i64 %x.coerce1) #0 {
+entry:  ; Returns the pair { 0, %x.coerce0 }; expects %eax zeroed and %rdi moved to %rdx.
+  %.fca.1.insert = insertvalue { i64, i64 } { i64 0, i64 undef }, i64 %x.coerce0, 1
+  ret { i64, i64 } %.fca.1.insert
+; CHECK-LABEL: TestShift128:
+; CHECK:       xorl %eax, %eax
+; CHECK-NEXT:  movq %rdi, %rdx
+; CHECK-NEXT:  retq
+}
+
+@v128 = common global i128 0, align 16
+@v128_2 = common global i128 0, align 16
+
+define void @TestShift128_2() nounwind {
+entry:  ; Shifts @v128 left by 96 bits, ORs in @v128_2, and stores back to @v128; the function writes memory, so it is nounwind but not readnone.
+  %0 = load i128, i128* @v128, align 16
+  %shl = shl i128 %0, 96
+  %1 = load i128, i128* @v128_2, align 16
+  %or = or i128 %shl, %1
+  store i128 %or, i128* @v128, align 16
+  ret void
+; CHECK-LABEL: TestShift128_2:
+; CHECK:       movq v128(%rip), %rax
+; CHECK-NEXT:  shlq $32, %rax
+; CHECK-NEXT:  movq v128_2(%rip), %rcx
+; CHECK-NEXT:  orq v128_2+8(%rip), %rax
+; CHECK-NEXT:  movq %rcx, v128(%rip)
+; CHECK-NEXT:  movq %rax, v128+8(%rip)
+; CHECK-NEXT:  retq
+}
+
+define fp128 @acosl(fp128 %x) #0 {
+entry:  ; Same IR body as TestI128_4, defined under the name acosl: clears the low 64 bits of %x's pattern and adds the result to %x.
+  %0 = bitcast fp128 %x to i128
+  %bf.clear = and i128 %0, -18446744073709551616
+  %1 = bitcast i128 %bf.clear to fp128
+  %add = fadd fp128 %1, %x
+  ret fp128 %add
+; CHECK-LABEL: acosl:
+; CHECK:       movaps %xmm0, %xmm1
+; CHECK-NEXT:  movaps %xmm1, 16(%rsp)
+; CHECK-NEXT:  movq 24(%rsp), %rax
+; CHECK-NEXT:  movq %rax, 8(%rsp)
+; CHECK-NEXT:  movq $0, (%rsp)
+; CHECK-NEXT:  movaps (%rsp), %xmm0
+; CHECK-NEXT:  callq __addtf3
+; CHECK:       retq
+}
+
+; Compare i128 values and check i128 constants.
+define fp128 @TestComp(fp128 %x, fp128 %y) #0 {
+entry:  ; Same IR body as TestI128_2: returns %x when its bit pattern, read as a signed i128, is greater than -1; otherwise %y.
+  %0 = bitcast fp128 %x to i128
+  %cmp = icmp sgt i128 %0, -1
+  %cond = select i1 %cmp, fp128 %x, fp128 %y
+  ret fp128 %cond
+; CHECK-LABEL: TestComp:
+; CHECK:       movaps %xmm0, -24(%rsp)
+; CHECK-NEXT:  cmpq $0, -16(%rsp)
+; CHECK-NEXT:  jns
+; CHECK:       movaps %xmm1, %xmm0
+; CHECK:       retq
+}
+
+declare void @foo(fp128) #1
+
+; Test logical operations on fp128 values.
+define fp128 @TestFABS_LD(fp128 %x) #0 {
+entry:  ; Calls fabsl on an fp128 value; expects an andps mask immediately followed by retq instead of a call.
+  %call = tail call fp128 @fabsl(fp128 %x) #2
+  ret fp128 %call
+; CHECK-LABEL: TestFABS_LD:
+; CHECK:       andps {{.*}}, %xmm0
+; CHECK-NEXT:  retq
+}
+
+declare fp128 @fabsl(fp128) #1
+
+declare fp128 @copysignl(fp128, fp128) #1
+
+; Test more complicated logical operations generated from copysignl.
+define void @TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result, { fp128, fp128 }* byval nocapture readonly align 16 %z) #0 {
+entry:  ; Loads both fp128 halves of %z, compares and subtracts them, then combines fabsl/copysignl results into %agg.result; only __subtf3 and __gttf2 are expected as calls.
+  %z.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 0
+  %z.real = load fp128, fp128* %z.realp, align 16
+  %z.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 1
+  %z.imag4 = load fp128, fp128* %z.imagp, align 16
+  %cmp = fcmp ogt fp128 %z.real, %z.imag4
+  %sub = fsub fp128 %z.imag4, %z.imag4
+  br i1 %cmp, label %if.then, label %cleanup
+
+if.then:                                          ; preds = %entry
+  %call = tail call fp128 @fabsl(fp128 %sub) #2
+  br label %cleanup
+
+cleanup:                                          ; preds = %entry, %if.then
+  %z.real.sink = phi fp128 [ %z.real, %if.then ], [ %sub, %entry ]
+  %call.sink = phi fp128 [ %call, %if.then ], [ %z.real, %entry ]
+  %call5 = tail call fp128 @copysignl(fp128 %z.real.sink, fp128 %z.imag4) #2
+  %0 = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %agg.result, i64 0, i32 0
+  %1 = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %agg.result, i64 0, i32 1
+  store fp128 %call.sink, fp128* %0, align 16
+  store fp128 %call5, fp128* %1, align 16
+  ret void
+; CHECK-LABEL: TestCopySign:
+; CHECK-NOT:   call
+; CHECK:       callq __subtf3
+; CHECK-NOT:   call
+; CHECK:       callq __gttf2
+; CHECK-NOT:   call
+; CHECK:       andps {{.*}}, %xmm0
+; CHECK:       retq
+}
+
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
Index: test/CodeGen/X86/fp128-libcalls.ll
===================================================================
--- test/CodeGen/X86/fp128-libcalls.ll
+++ test/CodeGen/X86/fp128-libcalls.ll
@@ -0,0 +1,201 @@
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+
+; Check fp128 add/sub/mul/div soft floating point library function calls.
+
+@vf64 = common global double 0.000000e+00, align 8
+@vf128 = common global fp128 0xL00000000000000000000000000000000, align 16
+
+define void @Test64Add(double %d1, double %d2) {
+entry:  ; Adds two double arguments and stores to @vf64; expects addsd and no library call.
+  %add = fadd double %d1, %d2
+  store double %add, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: Test64Add:
+; CHECK:       addsd %xmm1, %xmm0
+; CHECK-NEXT:  movsd %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @Test64_1Add(double %d1) {
+entry:  ; Adds the double argument to @vf64 in place; expects addsd with a memory operand.
+  %0 = load double, double* @vf64, align 8
+  %add = fadd double %0, %d1
+  store double %add, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: Test64_1Add:
+; CHECK:       addsd vf64(%rip), %xmm0
+; CHECK-NEXT:  movsd %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @Test128Add(fp128 %d1, fp128 %d2) {
+entry:  ; Adds two fp128 arguments and stores to @vf128; expects a call to __addtf3.
+  %add = fadd fp128 %d1, %d2
+  store fp128 %add, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: Test128Add:
+; CHECK:       callq __addtf3
+; CHECK-NEXT:  movaps %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @Test128_1Add(fp128 %d1){
+entry:  ; Adds the fp128 argument to @vf128 in place; expects the argument moved to %xmm1, @vf128 loaded into %xmm0, then __addtf3.
+  %0 = load fp128, fp128* @vf128, align 16
+  %add = fadd fp128 %0, %d1
+  store fp128 %add, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: Test128_1Add:
+; CHECK:       movaps  %xmm0, %xmm1
+; CHECK-NEXT:  movaps  vf128(%rip), %xmm0
+; CHECK-NEXT:  callq   __addtf3
+; CHECK-NEXT:  movaps  %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @Test64Sub(double %d1, double %d2){
+entry:  ; Subtracts %d2 from %d1 (doubles) and stores to @vf64; expects subsd and no library call.
+  %sub = fsub double %d1, %d2
+  store double %sub, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: Test64Sub:
+; CHECK:       subsd %xmm1, %xmm0
+; CHECK-NEXT:  movsd %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @Test64_1Sub(double %d1){
+entry:  ; Subtracts the double argument from @vf64 in place; expects @vf64 loaded into %xmm1 before subsd.
+  %0 = load double, double* @vf64, align 8
+  %sub = fsub double %0, %d1
+  store double %sub, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: Test64_1Sub:
+; CHECK:       movsd vf64(%rip), %xmm1
+; CHECK-NEXT:  subsd   %xmm0, %xmm1
+; CHECK-NEXT:  movsd %xmm1, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @Test128Sub(fp128 %d1, fp128 %d2){
+entry:  ; Subtracts %d2 from %d1 (fp128) and stores to @vf128; expects a call to __subtf3.
+  %sub = fsub fp128 %d1, %d2
+  store fp128 %sub, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: Test128Sub:
+; CHECK:       callq __subtf3
+; CHECK-NEXT:  movaps %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @Test128_1Sub(fp128 %d1){
+entry:  ; Subtracts the fp128 argument from @vf128 in place; expects the argument moved to %xmm1, @vf128 loaded into %xmm0, then __subtf3.
+  %0 = load fp128, fp128* @vf128, align 16
+  %sub = fsub fp128 %0, %d1
+  store fp128 %sub, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: Test128_1Sub:
+; CHECK:       movaps  %xmm0, %xmm1
+; CHECK-NEXT:  movaps  vf128(%rip), %xmm0
+; CHECK-NEXT:  callq   __subtf3
+; CHECK-NEXT:  movaps  %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @Test64Mul(double %d1, double %d2){
+entry:  ; Multiplies two double arguments and stores to @vf64; expects mulsd and no library call.
+  %mul = fmul double %d1, %d2
+  store double %mul, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: Test64Mul:
+; CHECK:       mulsd %xmm1, %xmm0
+; CHECK-NEXT:  movsd %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @Test64_1Mul(double %d1){
+entry:  ; Multiplies @vf64 by the double argument in place; expects mulsd with a memory operand.
+  %0 = load double, double* @vf64, align 8
+  %mul = fmul double %0, %d1
+  store double %mul, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: Test64_1Mul:
+; CHECK:       mulsd vf64(%rip), %xmm0
+; CHECK-NEXT:  movsd %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @Test128Mul(fp128 %d1, fp128 %d2){
+entry:  ; Multiplies two fp128 arguments and stores to @vf128; expects a call to __multf3.
+  %mul = fmul fp128 %d1, %d2
+  store fp128 %mul, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: Test128Mul:
+; CHECK:       callq __multf3
+; CHECK-NEXT:  movaps %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @Test128_1Mul(fp128 %d1){
+entry:  ; Multiplies @vf128 by the fp128 argument in place; expects the argument moved to %xmm1, @vf128 loaded into %xmm0, then __multf3.
+  %0 = load fp128, fp128* @vf128, align 16
+  %mul = fmul fp128 %0, %d1
+  store fp128 %mul, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: Test128_1Mul:
+; CHECK:       movaps  %xmm0, %xmm1
+; CHECK-NEXT:  movaps  vf128(%rip), %xmm0
+; CHECK-NEXT:  callq   __multf3
+; CHECK-NEXT:  movaps  %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @Test64Div(double %d1, double %d2){
+entry:  ; Divides %d1 by %d2 (doubles) and stores to @vf64; expects divsd and no library call.
+  %div = fdiv double %d1, %d2
+  store double %div, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: Test64Div:
+; CHECK:       divsd %xmm1, %xmm0
+; CHECK-NEXT:  movsd %xmm0, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @Test64_1Div(double %d1){
+entry:  ; Divides @vf64 by the double argument in place; expects @vf64 loaded into %xmm1 before divsd.
+  %0 = load double, double* @vf64, align 8
+  %div = fdiv double %0, %d1
+  store double %div, double* @vf64, align 8
+  ret void
+; CHECK-LABEL: Test64_1Div:
+; CHECK:       movsd vf64(%rip), %xmm1
+; CHECK-NEXT:  divsd %xmm0, %xmm1
+; CHECK-NEXT:  movsd %xmm1, vf64(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @Test128Div(fp128 %d1, fp128 %d2){
+entry:  ; Divides %d1 by %d2 (fp128) and stores to @vf128; expects a call to __divtf3.
+  %div = fdiv fp128 %d1, %d2
+  store fp128 %div, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: Test128Div:
+; CHECK:       callq __divtf3
+; CHECK-NEXT:  movaps %xmm0, vf128(%rip)
+; CHECK:       retq
+}
+
+define void @Test128_1Div(fp128 %d1){
+entry:  ; Divides @vf128 by the fp128 argument in place; expects the argument moved to %xmm1, @vf128 loaded into %xmm0, then __divtf3.
+  %0 = load fp128, fp128* @vf128, align 16
+  %div = fdiv fp128 %0, %d1
+  store fp128 %div, fp128* @vf128, align 16
+  ret void
+; CHECK-LABEL: Test128_1Div:
+; CHECK:       movaps  %xmm0, %xmm1
+; CHECK-NEXT:  movaps  vf128(%rip), %xmm0
+; CHECK-NEXT:  callq   __divtf3
+; CHECK-NEXT:  movaps  %xmm0, vf128(%rip)
+; CHECK:       retq
+}
Index: test/CodeGen/X86/fp128-load.ll
===================================================================
--- test/CodeGen/X86/fp128-load.ll
+++ test/CodeGen/X86/fp128-load.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+
+; double myD = 1.0;
+@my_double = global double 1.000000e+00, align 8
+
+; long double myFP80 = 1.0L;  // x86_64-linux-gnu
+@my_fp80 = global x86_fp80 0xK3FFF8000000000000000, align 16
+
+; long double myFP128 = 1.0L;  // x86_64-linux-android
+@my_fp128 = global fp128 0xL00000000000000003FFF000000000000, align 16
+
+define double @get_double() {
+entry:  ; Loads and returns the double global @my_double; expects a single movsd into %xmm0.
+  %0 = load double, double* @my_double, align 8
+  ret double %0
+; CHECK-LABEL: get_double:
+; CHECK:       movsd   my_double(%rip), %xmm0
+; CHECK-NEXT:  retq
+}
+
+define x86_fp80 @get_fp80() {
+entry:  ; Loads and returns the x86_fp80 global @my_fp80; expects an fldt load.
+  %0 = load x86_fp80, x86_fp80* @my_fp80, align 16
+  ret x86_fp80 %0
+; CHECK-LABEL: get_fp80:
+; CHECK:       fldt    my_fp80(%rip)
+; CHECK-NEXT:  retq
+}
+
+define fp128 @get_fp128() {
+entry:  ; Loads and returns the fp128 global @my_fp128; expects a single movaps into %xmm0.
+  %0 = load fp128, fp128* @my_fp128, align 16
+  ret fp128 %0
+; CHECK-LABEL: get_fp128:
+; CHECK:       movaps my_fp128(%rip), %xmm0
+; CHECK-NEXT:  retq
+}
+
+@TestLoadExtend.data = internal unnamed_addr constant [2 x float] [float 0x3FB99999A0000000, float 0x3FC99999A0000000], align 4
+
+define fp128 @TestLoadExtend(fp128 %x, i32 %n) {
+entry:  ; Loads element %n of the float table @TestLoadExtend.data and extends it to fp128; expects a call to __extendsftf2. %x is unused.
+  %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds [2 x float], [2 x float]* @TestLoadExtend.data, i64 0, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %conv = fpext float %0 to fp128
+  ret fp128 %conv
+; CHECK-LABEL: TestLoadExtend:
+; CHECK:       movslq  %edi, %rax
+; CHECK-NEXT:  movss   TestLoadExtend.data(,%rax,4), %xmm0
+; CHECK-NEXT:  callq   __extendsftf2
+; CHECK:       retq
+}
+
+; CHECK-LABEL:  my_double:
+; CHECK-NEXT:  .quad   4607182418800017408
+; CHECK-NEXT:  .size   my_double, 8
+
+; CHECK-LABEL:  my_fp80:
+; CHECK-NEXT:  .quad   -9223372036854775808
+; CHECK-NEXT:  .short  16383
+; CHECK-NEXT:  .zero   6
+; CHECK-NEXT:  .size   my_fp80, 16
+
+; CHECK-LABEL:  my_fp128:
+; CHECK-NEXT:  .quad   0
+; CHECK-NEXT:  .quad   4611404543450677248
+; CHECK-NEXT:  .size   my_fp128, 16
Index: test/CodeGen/X86/fp128-store.ll
===================================================================
--- test/CodeGen/X86/fp128-store.ll
+++ test/CodeGen/X86/fp128-store.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+
+; double myD = 1.0;
+@myD = global double 1.000000e+00, align 8
+
+; long double myFP80 = 1.0L;  // x86_64-linux-gnu
+@myFP80 = global x86_fp80 0xK3FFF8000000000000000, align 16
+
+; long double myFP128 = 1.0L;  // x86_64-linux-android
+@myFP128 = global fp128 0xL00000000000000003FFF000000000000, align 16
+
+define void @set_D(double %x) {
+entry:
+  store double %x, double* @myD, align 8
+  ret void
+; CHECK-LABEL: set_D:
+; CHECK:       movsd   %xmm0, myD(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @set_FP80(x86_fp80 %x) {
+entry:
+  store x86_fp80 %x, x86_fp80* @myFP80, align 16
+  ret void
+; CHECK-LABEL: set_FP80:
+; CHECK:       fldt    8(%rsp)
+; CHECK-NEXT:  fstpt   myFP80(%rip)
+; CHECK-NEXT:  retq
+}
+
+define void @set_FP128(fp128 %x) {
+entry:
+  store fp128 %x, fp128* @myFP128, align 16
+  ret void
+; CHECK-LABEL: set_FP128:
+; CHECK:       movaps  %xmm0, myFP128(%rip)
+; CHECK-NEXT:  retq
+}
Index: test/CodeGen/X86/soft-fp.ll
===================================================================
--- test/CodeGen/X86/soft-fp.ll
+++ test/CodeGen/X86/soft-fp.ll
@@ -1,8 +1,14 @@
-; RUN: llc < %s -march=x86    -mattr=+sse2,+soft-float | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mattr=+sse2,+soft-float | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-gnux32 -mattr=+sse2,+soft-float | FileCheck %s
+; RUN: llc < %s -march=x86    -mattr=+mmx,+sse,+soft-float \
+; RUN:     | FileCheck %s --check-prefix=SOFT1 --check-prefix=CHECK
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2,+soft-float \
+; RUN:     | FileCheck %s --check-prefix=SOFT2 --check-prefix=CHECK
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse \
+; RUN:     | FileCheck %s --check-prefix=SSE1 --check-prefix=CHECK
+; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 \
+; RUN:     | FileCheck %s --check-prefix=SSE2 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-gnux32 -mattr=+mmx,+sse2,+soft-float | FileCheck %s
 
-; CHECK-NOT: xmm{[0-9]+}
+; CHECK-NOT: xmm{{[0-9]+}}
 
 %struct.__va_list_tag = type { i32, i32, i8*, i8* }
 
@@ -15,6 +21,8 @@
 	call void @bar(%struct.__va_list_tag* %va3) nounwind
 	call void @llvm.va_end(i8* %va12)
 	ret i32 undef
+; CHECK-LABEL: t1:
+; CHECK:       ret{{[lq]}}
 }
 
 declare void @llvm.va_start(i8*) nounwind
@@ -27,4 +35,23 @@
 entry:
 	%0 = fadd float %a, %b		; <float> [#uses=1]
 	ret float %0
+; CHECK-LABEL: t2:
+; SOFT1-NOT:   xmm{{[0-9]+}}
+; SOFT2-NOT:   xmm{{[0-9]+}}
+; SSE1:        xmm{{[0-9]+}}
+; SSE2:        xmm{{[0-9]+}}
+; CHECK:       ret{{[lq]}}
+}
+
+; soft-float means no SSE instructions, and fp128 is passed as a pair of i64.
+define fp128 @t3(fp128 %a, fp128 %b) nounwind readnone {
+entry:
+	%0 = fadd fp128 %b, %a
+	ret fp128 %0
+; CHECK-LABEL: t3:
+; SOFT1-NOT:   xmm{{[0-9]+}}
+; SOFT2-NOT:   xmm{{[0-9]+}}
+; SSE1:        xmm{{[0-9]+}}
+; SSE2:        xmm{{[0-9]+}}
+; CHECK:       ret{{[lq]}}
 }
Index: utils/TableGen/X86RecognizableInstr.cpp
===================================================================
--- utils/TableGen/X86RecognizableInstr.cpp
+++ utils/TableGen/X86RecognizableInstr.cpp
@@ -951,6 +951,7 @@
   TYPE("f128mem",             TYPE_M128)
   TYPE("f256mem",             TYPE_M256)
   TYPE("f512mem",             TYPE_M512)
+  TYPE("FR128",               TYPE_XMM128)
   TYPE("FR64",                TYPE_XMM64)
   TYPE("FR64X",               TYPE_XMM64)
   TYPE("f64mem",              TYPE_M64FP)
@@ -1069,6 +1070,7 @@
   // register IDs in 8-bit immediates nowadays.
   ENCODING("FR32",            ENCODING_IB)
   ENCODING("FR64",            ENCODING_IB)
+  ENCODING("FR128",           ENCODING_IB)
   ENCODING("VR128",           ENCODING_IB)
   ENCODING("VR256",           ENCODING_IB)
   ENCODING("FR32X",           ENCODING_IB)
@@ -1091,6 +1093,7 @@
   ENCODING("GR8",             ENCODING_RM)
   ENCODING("VR128",           ENCODING_RM)
   ENCODING("VR128X",          ENCODING_RM)
+  ENCODING("FR128",           ENCODING_RM)
   ENCODING("FR64",            ENCODING_RM)
   ENCODING("FR32",            ENCODING_RM)
   ENCODING("FR64X",           ENCODING_RM)
@@ -1120,6 +1123,7 @@
   ENCODING("GR64",            ENCODING_REG)
   ENCODING("GR8",             ENCODING_REG)
   ENCODING("VR128",           ENCODING_REG)
+  ENCODING("FR128",           ENCODING_REG)
   ENCODING("FR64",            ENCODING_REG)
   ENCODING("FR32",            ENCODING_REG)
   ENCODING("VR64",            ENCODING_REG)
@@ -1157,6 +1161,7 @@
   ENCODING("GR32",            ENCODING_VVVV)
   ENCODING("GR64",            ENCODING_VVVV)
   ENCODING("FR32",            ENCODING_VVVV)
+  ENCODING("FR128",           ENCODING_VVVV)
   ENCODING("FR64",            ENCODING_VVVV)
   ENCODING("VR128",           ENCODING_VVVV)
   ENCODING("VR256",           ENCODING_VVVV)