diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -817,10 +817,10 @@ // Handled in ARM's setABI(). } else if (Triple.getArch() == llvm::Triple::x86) { this->resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-" - "i64:64-n8:16:32-S128"); + "i64:64-i128:128-n8:16:32-S128"); } else if (Triple.getArch() == llvm::Triple::x86_64) { this->resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-" - "i64:64-n8:16:32:64-S128"); + "i64:64-i128:128-n8:16:32:64-S128"); } else if (Triple.getArch() == llvm::Triple::mipsel) { // Handled on mips' setDataLayout. } else { diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -431,13 +431,12 @@ LongDoubleWidth = 96; LongDoubleAlign = 32; SuitableAlign = 128; - resetDataLayout( - Triple.isOSBinFormatMachO() - ? "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-" - "f80:32-n8:16:32-S128" - : "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-" - "f80:32-n8:16:32-S128", - Triple.isOSBinFormatMachO() ? "_" : ""); + resetDataLayout(Triple.isOSBinFormatMachO() + ? "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:" + "128-f64:32:64-f80:32-n8:16:32-S128" + : "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:" + "128-f64:32:64-f80:32-n8:16:32-S128", + Triple.isOSBinFormatMachO() ? "_" : ""); SizeType = UnsignedInt; PtrDiffType = SignedInt; IntPtrType = SignedInt; @@ -542,8 +541,9 @@ UseSignedCharForObjCBool = false; SizeType = UnsignedLong; IntPtrType = SignedLong; - resetDataLayout("e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-" - "f80:128-n8:16:32-S128", "_"); + resetDataLayout("e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-" + "f64:32:64-f80:128-n8:16:32-S128", + "_"); HasAlignMac68kSupport = true; } @@ -570,7 +570,7 @@ getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF(); bool IsMSVC = getTriple().isWindowsMSVCEnvironment(); std::string Layout = IsWinCOFF ? "e-m:x" : "e-m:e"; - Layout += "-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-"; + Layout += "-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"; Layout += IsMSVC ? "f80:128" : "f80:32"; Layout += "-n8:16:32-a:0:32-S32"; resetDataLayout(Layout, IsWinCOFF ? "_" : ""); @@ -621,8 +621,8 @@ : X86_32TargetInfo(Triple, Opts) { this->WCharType = TargetInfo::UnsignedShort; DoubleAlign = LongLongAlign = 64; - resetDataLayout("e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:" - "32-n8:16:32-a:0:32-S32", + resetDataLayout("e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-" + "i128:128-f80:32-n8:16:32-a:0:32-S32", "_"); } @@ -660,8 +660,8 @@ : X86_32TargetInfo(Triple, Opts) { LongDoubleWidth = 64; LongDoubleFormat = &llvm::APFloat::IEEEdouble(); - resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:32-f64:" - "32-f128:32-n8:16:32-a:0:32-S32"); + resetDataLayout("e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:32-" + "f64:32-f128:32-n8:16:32-a:0:32-S32"); WIntType = UnsignedInt; } @@ -721,11 +721,11 @@ // Pointers are 32-bit in x32. resetDataLayout(IsX32 ? "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-" - "i64:64-f80:128-n8:16:32:64-S128" - : IsWinCOFF ? "e-m:w-p270:32:32-p271:32:32-p272:64:" - "64-i64:64-f80:128-n8:16:32:64-S128" - : "e-m:e-p270:32:32-p271:32:32-p272:64:" - "64-i64:64-f80:128-n8:16:32:64-S128"); + "i64:64-i128:128-f80:128-n8:16:32:64-S128" + : IsWinCOFF ? 
"e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:" + "64-i128:128-f80:128-n8:16:32:64-S128" + : "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:" + "64-i128:128-f80:128-n8:16:32:64-S128"); // Use fpret only for long double. RealTypeUsesObjCFPRetMask = (unsigned)FloatModeKind::LongDouble; @@ -922,8 +922,9 @@ llvm::Triple T = llvm::Triple(Triple); if (T.isiOS()) UseSignedCharForObjCBool = false; - resetDataLayout("e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:" - "16:32:64-S128", "_"); + resetDataLayout("e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-" + "f80:128-n8:16:32:64-S128", + "_"); } bool handleTargetFeatures(std::vector &Features, diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -1,26 +1,26 @@ // RUN: %clang_cc1 -triple i686-unknown-unknown -emit-llvm -o - %s | \ // RUN: FileCheck --check-prefix=I686-UNKNOWN %s -// I686-UNKNOWN: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128" +// I686-UNKNOWN: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128" // RUN: %clang_cc1 -triple i686-apple-darwin9 -emit-llvm -o - %s | \ // RUN: FileCheck --check-prefix=I686-DARWIN %s -// I686-DARWIN: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:128-n8:16:32-S128" +// I686-DARWIN: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:128-n8:16:32-S128" // RUN: %clang_cc1 -triple i686-unknown-win32 -emit-llvm -o - %s | \ // RUN: FileCheck --check-prefix=I686-WIN32 %s -// I686-WIN32: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32-a:0:32-S32" +// I686-WIN32: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-a:0:32-S32" // RUN: %clang_cc1 -triple i686-unknown-cygwin -emit-llvm -o - %s | \ // RUN: FileCheck --check-prefix=I686-CYGWIN %s -// I686-CYGWIN: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32" +// I686-CYGWIN: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:32-n8:16:32-a:0:32-S32" // RUN: %clang_cc1 -triple i686-pc-macho -emit-llvm -o - %s | \ // RUN: FileCheck --check-prefix=I686-MACHO %s -// I686-MACHO: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-f64:32:64-f80:32-n8:16:32-S128" +// I686-MACHO: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128" // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | \ // RUN: FileCheck --check-prefix=X86_64 %s -// X86_64: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +// X86_64: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" // RUN: %clang_cc1 -triple xcore-unknown-unknown -emit-llvm -o - %s | \ // RUN: FileCheck --check-prefix=XCORE %s @@ -92,11 +92,11 @@ // RUN: %clang_cc1 -triple i686-nacl -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=I686-NACL -// I686-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-n8:16:32-S128" +// I686-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n8:16:32-S128" // RUN: %clang_cc1 -triple x86_64-nacl -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=X86_64-NACL -// 
X86_64-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-n8:16:32:64-S128" +// X86_64-NACL: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n8:16:32:64-S128" // RUN: %clang_cc1 -triple arm-nacl -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=ARM-NACL diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -129,6 +129,11 @@ Changes to the X86 Backend -------------------------- +* The ``i128`` type now matches GCC and clang's ``__int128`` type. This mainly + benefits external projects such as Rust which aim to be binary compatible + with C, but also fixes code generation where LLVM already assumed that the + type matched and called into libgcc helper functions. + Changes to the OCaml bindings ----------------------------- diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5201,13 +5201,29 @@ // If the datalayout matches the expected format, add pointer size address // spaces to the datalayout. std::string AddrSpaces = "-p270:32:32-p271:32:32-p272:64:64"; - if (!DL.contains(AddrSpaces)) { + if (StringRef Ref = Res; !Ref.contains(AddrSpaces)) { SmallVector<StringRef, 4> Groups; Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)"); - if (R.match(DL, &Groups)) + if (R.match(Res, &Groups)) Res = (Groups[1] + AddrSpaces + Groups[3]).str(); } + // i128 values need to be 16-byte-aligned. LLVM already called into libgcc + // for i128 operations prior to this being reflected in the data layout, and + // clang mostly produced LLVM IR that already aligned i128 to 16 byte + // boundaries, so although this is a breaking change, the upgrade is expected + // to fix more IR than it breaks. + // Intel MCU is an exception and uses 4-byte-alignment. + if (!T.isOSIAMCU()) { + std::string I128 = "-i128:128"; + if (StringRef Ref = Res; !Ref.contains(I128)) { + SmallVector<StringRef, 4> Groups; + Regex R("^(e(-[mpi][^-]*)*)((-[^mpi][^-]*)*)$"); + if (R.match(Res, &Groups)) + Res = (Groups[1] + I128 + Groups[3]).str(); + } + } + // For 32-bit MSVC targets, raise the alignment of f80 values to 16 bytes. // Raising the alignment is safe because Clang did not produce f80 values in // the MSVC environment before this upgrade was added. diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -130,12 +130,14 @@ Ret += "-p270:32:32-p271:32:32-p272:64:64"; // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. + // 128 bit integers are not specified in the 32-bit ABIs but are used + // internally for lowering f128, so we match the alignment to that. if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) - Ret += "-i64:64"; + Ret += "-i64:64-i128:128"; else if (TT.isOSIAMCU()) Ret += "-i64:32-f64:32"; else - Ret += "-f64:32:64"; + Ret += "-i128:128-f64:32:64"; // Some ABIs align long double to 128 bits, others to 32. 
if (TT.isOSNaCl() || TT.isOSIAMCU()) diff --git a/llvm/test/Bitcode/upgrade-datalayout.ll b/llvm/test/Bitcode/upgrade-datalayout.ll --- a/llvm/test/Bitcode/upgrade-datalayout.ll +++ b/llvm/test/Bitcode/upgrade-datalayout.ll @@ -5,5 +5,5 @@ target datalayout = "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -; CHECK: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +; CHECK: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Bitcode/upgrade-datalayout2.ll b/llvm/test/Bitcode/upgrade-datalayout2.ll --- a/llvm/test/Bitcode/upgrade-datalayout2.ll +++ b/llvm/test/Bitcode/upgrade-datalayout2.ll @@ -2,6 +2,12 @@ ; match a possible x86 datalayout. ; ; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s +; +; XFAIL: * +; No implementation of the data layout upgrade ever checked whether the data +; layout was a possible x86 data layout, so the logic that this test aims to +; check was never implemented. We always upgraded data layouts that were not +; possible x86 data layouts, we merely did not previously upgrade this one. target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Bitcode/upgrade-datalayout3.ll b/llvm/test/Bitcode/upgrade-datalayout3.ll --- a/llvm/test/Bitcode/upgrade-datalayout3.ll +++ b/llvm/test/Bitcode/upgrade-datalayout3.ll @@ -5,4 +5,4 @@ target datalayout = "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32" target triple = "i686-pc-windows-msvc" -; CHECK: target datalayout = "e-m:w-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32-S32" +; CHECK: target datalayout = "e-m:w-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-S32" diff --git a/llvm/test/Bitcode/upgrade-datalayout4.ll b/llvm/test/Bitcode/upgrade-datalayout4.ll --- a/llvm/test/Bitcode/upgrade-datalayout4.ll +++ b/llvm/test/Bitcode/upgrade-datalayout4.ll @@ -5,4 +5,4 @@ target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:32-n8:16:32-a:0:32-S32" target triple = "i686-pc-windows-msvc" -; CHECK: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32-a:0:32-S32" +; CHECK: target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-a:0:32-S32" diff --git a/llvm/test/Bitcode/upgrade-datalayout.ll b/llvm/test/Bitcode/upgrade-datalayout5.ll copy from llvm/test/Bitcode/upgrade-datalayout.ll copy to llvm/test/Bitcode/upgrade-datalayout5.ll --- a/llvm/test/Bitcode/upgrade-datalayout.ll +++ b/llvm/test/Bitcode/upgrade-datalayout5.ll @@ -2,8 +2,7 @@ ; ; RUN: llvm-as %s -o - | llvm-dis - | FileCheck %s -target datalayout = "e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; CHECK: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-pc-linux-gnu" +; CHECK: target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128" diff --git a/llvm/test/CodeGen/X86/AMX/amx-config.ll b/llvm/test/CodeGen/X86/AMX/amx-config.ll --- a/llvm/test/CodeGen/X86/AMX/amx-config.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-config.ll @@ -79,10 +79,10 @@ ; 
AVX1-LABEL: test_api: ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) -; AVX1-NEXT: movups %xmm1, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) +; AVX1-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movb $1, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movw %dx, -{{[0-9]+}}(%rsp) ; AVX1-NEXT: movw %dx, -{{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll --- a/llvm/test/CodeGen/X86/arg-copy-elide.ll +++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll @@ -186,8 +186,8 @@ ; CHECK-NEXT: pushl %ebx ; CHECK-NEXT: pushl %edi ; CHECK-NEXT: pushl %esi -; CHECK-NEXT: andl $-8, %esp -; CHECK-NEXT: subl $32, %esp +; CHECK-NEXT: andl $-16, %esp +; CHECK-NEXT: subl $48, %esp ; CHECK-NEXT: movl 12(%ebp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: movl 16(%ebp), %ebx diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll --- a/llvm/test/CodeGen/X86/atomic-idempotent.ll +++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll @@ -182,12 +182,10 @@ ; X86-SSE2-NEXT: .cfi_offset %ebp, -8 ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp -; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: .cfi_offset %esi, -16 -; X86-SSE2-NEXT: .cfi_offset %edi, -12 +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $32, %esp +; X86-SSE2-NEXT: .cfi_offset %esi, -12 ; X86-SSE2-NEXT: movl 8(%ebp), %esi ; X86-SSE2-NEXT: movl %esp, %eax ; X86-SSE2-NEXT: pushl $0 @@ -198,18 +196,11 @@ ; X86-SSE2-NEXT: pushl %eax ; X86-SSE2-NEXT: calll __sync_fetch_and_or_16 ; X86-SSE2-NEXT: addl $20, %esp -; X86-SSE2-NEXT: movl (%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl %edi, 8(%esi) -; X86-SSE2-NEXT: movl %edx, 12(%esi) -; X86-SSE2-NEXT: movl %eax, (%esi) -; X86-SSE2-NEXT: movl %ecx, 4(%esi) +; X86-SSE2-NEXT: movaps (%esp), %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, (%esi) ; X86-SSE2-NEXT: movl %esi, %eax -; X86-SSE2-NEXT: leal -8(%ebp), %esp +; X86-SSE2-NEXT: leal -4(%ebp), %esp ; X86-SSE2-NEXT: popl %esi -; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4 ; X86-SSE2-NEXT: retl $4 @@ -223,7 +214,7 @@ ; X86-SLM-NEXT: .cfi_def_cfa_register %ebp ; X86-SLM-NEXT: pushl %edi ; X86-SLM-NEXT: pushl %esi -; X86-SLM-NEXT: andl $-8, %esp +; X86-SLM-NEXT: andl $-16, %esp ; X86-SLM-NEXT: subl $16, %esp ; X86-SLM-NEXT: .cfi_offset %esi, -16 ; X86-SLM-NEXT: .cfi_offset %edi, -12 @@ -263,7 +254,7 @@ ; X86-ATOM-NEXT: .cfi_def_cfa_register %ebp ; X86-ATOM-NEXT: pushl %edi ; X86-ATOM-NEXT: pushl %esi -; X86-ATOM-NEXT: andl $-8, %esp +; X86-ATOM-NEXT: andl $-16, %esp ; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp ; X86-ATOM-NEXT: .cfi_offset %esi, -16 ; X86-ATOM-NEXT: .cfi_offset %edi, -12 @@ -528,8 +519,8 @@ ; X86-SSE2-NEXT: .cfi_offset %ebp, -8 ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp -; X86-SSE2-NEXT: andl $-8, %esp -; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $32, %esp ; X86-SSE2-NEXT: movl %esp, %eax ; X86-SSE2-NEXT: 
pushl $0 ; X86-SSE2-NEXT: pushl $0 @@ -551,8 +542,8 @@ ; X86-SLM-NEXT: .cfi_offset %ebp, -8 ; X86-SLM-NEXT: movl %esp, %ebp ; X86-SLM-NEXT: .cfi_def_cfa_register %ebp -; X86-SLM-NEXT: andl $-8, %esp -; X86-SLM-NEXT: subl $16, %esp +; X86-SLM-NEXT: andl $-16, %esp +; X86-SLM-NEXT: subl $32, %esp ; X86-SLM-NEXT: movl 8(%ebp), %eax ; X86-SLM-NEXT: movl %esp, %ecx ; X86-SLM-NEXT: pushl $0 @@ -575,7 +566,7 @@ ; X86-ATOM-NEXT: .cfi_offset %ebp, -8 ; X86-ATOM-NEXT: leal (%esp), %ebp ; X86-ATOM-NEXT: .cfi_def_cfa_register %ebp -; X86-ATOM-NEXT: andl $-8, %esp +; X86-ATOM-NEXT: andl $-16, %esp ; X86-ATOM-NEXT: leal -{{[0-9]+}}(%esp), %esp ; X86-ATOM-NEXT: movl 8(%ebp), %eax ; X86-ATOM-NEXT: movl %esp, %ecx diff --git a/llvm/test/CodeGen/X86/atomic-non-integer.ll b/llvm/test/CodeGen/X86/atomic-non-integer.ll --- a/llvm/test/CodeGen/X86/atomic-non-integer.ll +++ b/llvm/test/CodeGen/X86/atomic-non-integer.ll @@ -157,8 +157,8 @@ ; ; X86-AVX-LABEL: store_fp128: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: subl $44, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 48 +; X86-AVX-NEXT: subl $60, %esp +; X86-AVX-NEXT: .cfi_def_cfa_offset 64 ; X86-AVX-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -166,7 +166,7 @@ ; X86-AVX-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl %eax, (%esp) ; X86-AVX-NEXT: calll __sync_lock_test_and_set_16 -; X86-AVX-NEXT: addl $40, %esp +; X86-AVX-NEXT: addl $56, %esp ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX-NEXT: retl ; @@ -394,67 +394,111 @@ } define fp128 @load_fp128(ptr %fptr) { -; X86-SSE-LABEL: load_fp128: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %edi -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 12 -; X86-SSE-NEXT: subl $20, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 32 -; X86-SSE-NEXT: .cfi_offset %esi, -12 -; X86-SSE-NEXT: .cfi_offset %edi, -8 -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE-NEXT: subl $8, %esp -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 8 -; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: pushl $0 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl $0 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl $0 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl $0 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl $0 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl $0 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl $0 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl $0 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: pushl %eax -; X86-SSE-NEXT: .cfi_adjust_cfa_offset 4 -; X86-SSE-NEXT: calll __sync_val_compare_and_swap_16 -; X86-SSE-NEXT: .cfi_adjust_cfa_offset -4 -; X86-SSE-NEXT: addl $44, %esp -; X86-SSE-NEXT: .cfi_adjust_cfa_offset -44 -; X86-SSE-NEXT: movl (%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) -; X86-SSE-NEXT: movl %ecx, 4(%esi) -; X86-SSE-NEXT: movl %esi, %eax -; X86-SSE-NEXT: addl $20, %esp -; X86-SSE-NEXT: .cfi_def_cfa_offset 12 -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: popl %edi -; X86-SSE-NEXT: .cfi_def_cfa_offset 4 -; X86-SSE-NEXT: retl $4 +; 
X86-SSE1-LABEL: load_fp128: +; X86-SSE1: # %bb.0: +; X86-SSE1-NEXT: pushl %edi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: pushl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: subl $20, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE1-NEXT: .cfi_offset %esi, -12 +; X86-SSE1-NEXT: .cfi_offset %edi, -8 +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE1-NEXT: subl $8, %esp +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 8 +; X86-SSE1-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl $0 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: pushl %eax +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE1-NEXT: calll __sync_val_compare_and_swap_16 +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset -4 +; X86-SSE1-NEXT: addl $44, %esp +; X86-SSE1-NEXT: .cfi_adjust_cfa_offset -44 +; X86-SSE1-NEXT: movl (%esp), %eax +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE1-NEXT: movl %edi, 8(%esi) +; X86-SSE1-NEXT: movl %edx, 12(%esi) +; X86-SSE1-NEXT: movl %eax, (%esi) +; X86-SSE1-NEXT: movl %ecx, 4(%esi) +; X86-SSE1-NEXT: movl %esi, %eax +; X86-SSE1-NEXT: addl $20, %esp +; X86-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X86-SSE1-NEXT: popl %esi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE1-NEXT: popl %edi +; X86-SSE1-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE1-NEXT: retl $4 +; +; X86-SSE2-LABEL: load_fp128: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %esi +; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE2-NEXT: subl $24, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE2-NEXT: .cfi_offset %esi, -8 +; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: subl $8, %esp +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 8 +; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl $0 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl $0 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl $0 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl $0 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl $0 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl $0 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl $0 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl $0 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: pushl %eax +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset 4 +; X86-SSE2-NEXT: calll __sync_val_compare_and_swap_16 +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset -4 +; X86-SSE2-NEXT: addl $44, %esp +; X86-SSE2-NEXT: .cfi_adjust_cfa_offset -44 +; X86-SSE2-NEXT: movaps (%esp), %xmm0 +; X86-SSE2-NEXT: movaps %xmm0, (%esi) +; X86-SSE2-NEXT: movl %esi, %eax +; X86-SSE2-NEXT: addl $24, %esp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: .cfi_def_cfa_offset 4 +; 
X86-SSE2-NEXT: retl $4 ; ; X86-AVX-LABEL: load_fp128: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX-NEXT: subl $56, %esp -; X86-AVX-NEXT: .cfi_def_cfa_offset 64 +; X86-AVX-NEXT: subl $72, %esp +; X86-AVX-NEXT: .cfi_def_cfa_offset 80 ; X86-AVX-NEXT: .cfi_offset %esi, -8 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -466,10 +510,10 @@ ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: calll __sync_val_compare_and_swap_16 ; X86-AVX-NEXT: subl $4, %esp -; X86-AVX-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ; X86-AVX-NEXT: vmovaps %xmm0, (%esi) ; X86-AVX-NEXT: movl %esi, %eax -; X86-AVX-NEXT: addl $56, %esp +; X86-AVX-NEXT: addl $72, %esp ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -322,12 +322,12 @@ ; CHECK-O0-NEXT: .cfi_def_cfa_offset 64 ; CHECK-O0-NEXT: movq %rdi, %rax ; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-O0-NEXT: movq %rdi, (%rsp) # 8-byte Spill ; CHECK-O0-NEXT: movl $32, %edi ; CHECK-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx ; CHECK-O0-NEXT: xorl %ecx, %ecx ; CHECK-O0-NEXT: callq __atomic_load@PLT -; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-O0-NEXT: movq (%rsp), %rdi # 8-byte Reload ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; CHECK-O0-NEXT: movq {{[0-9]+}}(%rsp), %rdx @@ -373,12 +373,12 @@ ; CHECK-O0-NEXT: .cfi_def_cfa_offset 48 ; CHECK-O0-NEXT: movq %rsi, %rax ; CHECK-O0-NEXT: movq %rdi, %rsi -; CHECK-O0-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-O0-NEXT: movq %rax, (%rsp) ; CHECK-O0-NEXT: movq %rdx, {{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; CHECK-O0-NEXT: movl $32, %edi -; CHECK-O0-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; CHECK-O0-NEXT: movq %rsp, %rdx ; CHECK-O0-NEXT: xorl %ecx, %ecx ; CHECK-O0-NEXT: callq __atomic_store@PLT ; CHECK-O0-NEXT: addq $40, %rsp @@ -393,8 +393,8 @@ ; CHECK-O3-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; CHECK-O3-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-O3-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-O3-NEXT: movq %rsi, {{[0-9]+}}(%rsp) -; CHECK-O3-NEXT: leaq {{[0-9]+}}(%rsp), %rdx +; CHECK-O3-NEXT: movq %rsi, (%rsp) +; CHECK-O3-NEXT: movq %rsp, %rdx ; CHECK-O3-NEXT: movl $32, %edi ; CHECK-O3-NEXT: movq %rax, %rsi ; CHECK-O3-NEXT: xorl %ecx, %ecx diff --git a/llvm/test/CodeGen/X86/atomic-xor.ll b/llvm/test/CodeGen/X86/atomic-xor.ll --- a/llvm/test/CodeGen/X86/atomic-xor.ll +++ b/llvm/test/CodeGen/X86/atomic-xor.ll @@ -24,7 +24,7 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp +; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp ; X86-NEXT: movl 8(%ebp), %esi ; X86-NEXT: movl %esp, %eax diff --git a/llvm/test/CodeGen/X86/atomic128.ll b/llvm/test/CodeGen/X86/atomic128.ll --- a/llvm/test/CodeGen/X86/atomic128.ll +++ b/llvm/test/CodeGen/X86/atomic128.ll @@ -169,7 +169,7 @@ ; CHECK32-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK32-NEXT: addl $28, %esp ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl (%esp), %eax ; 
CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -237,7 +237,7 @@ ; CHECK32-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK32-NEXT: addl $28, %esp ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl (%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -305,7 +305,7 @@ ; CHECK32-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK32-NEXT: addl $28, %esp ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl (%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -373,7 +373,7 @@ ; CHECK32-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK32-NEXT: addl $28, %esp ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl (%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -444,7 +444,7 @@ ; CHECK32-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK32-NEXT: addl $28, %esp ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl (%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -515,7 +515,7 @@ ; CHECK32-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK32-NEXT: addl $28, %esp ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl (%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -586,7 +586,7 @@ ; CHECK32-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK32-NEXT: addl $28, %esp ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl (%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -657,7 +657,7 @@ ; CHECK32-NEXT: .cfi_adjust_cfa_offset -4 ; CHECK32-NEXT: addl $28, %esp ; CHECK32-NEXT: .cfi_adjust_cfa_offset -28 -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK32-NEXT: movl (%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll --- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll @@ -804,8 +804,8 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $32, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $48, %esp ; X86-NEXT: .cfi_offset %esi, -12 ; X86-NEXT: movl 8(%ebp), %esi ; X86-NEXT: vmovsh 12(%ebp), %xmm0 @@ -814,8 +814,8 @@ ; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixhfti ; X86-NEXT: subl $4, %esp -; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: vmovups %xmm0, (%esi) +; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi @@ -907,8 +907,8 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, 
%esp -; X86-NEXT: subl $32, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $48, %esp ; X86-NEXT: .cfi_offset %esi, -12 ; X86-NEXT: movl 8(%ebp), %esi ; X86-NEXT: vmovsh 12(%ebp), %xmm0 @@ -917,8 +917,8 @@ ; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixunshfti ; X86-NEXT: subl $4, %esp -; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 -; X86-NEXT: vmovups %xmm0, (%esi) +; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi @@ -987,8 +987,8 @@ ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-8, %esp -; X86-NEXT: subl $32, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $48, %esp ; X86-NEXT: movl 8(%ebp), %esi ; X86-NEXT: vmovsh 12(%ebp), %xmm0 ; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 @@ -997,7 +997,7 @@ ; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __extendsftf2 ; X86-NEXT: subl $4, %esp -; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: vmovaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax ; X86-NEXT: leal -4(%ebp), %esp diff --git a/llvm/test/CodeGen/X86/bitcast-i256.ll b/llvm/test/CodeGen/X86/bitcast-i256.ll --- a/llvm/test/CodeGen/X86/bitcast-i256.ll +++ b/llvm/test/CodeGen/X86/bitcast-i256.ll @@ -14,7 +14,7 @@ ; SLOW: # %bb.0: ; SLOW-NEXT: movq %rdi, %rax ; SLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi) -; SLOW-NEXT: vmovups %xmm0, (%rdi) +; SLOW-NEXT: vmovaps %xmm0, (%rdi) ; SLOW-NEXT: vzeroupper ; SLOW-NEXT: retq %r = bitcast <8 x i32> %a to i256 diff --git a/llvm/test/CodeGen/X86/catchpad-dynamic-alloca.ll b/llvm/test/CodeGen/X86/catchpad-dynamic-alloca.ll --- a/llvm/test/CodeGen/X86/catchpad-dynamic-alloca.ll +++ b/llvm/test/CodeGen/X86/catchpad-dynamic-alloca.ll @@ -62,4 +62,4 @@ ; CHECK-LABEL: $handlerMap$0$test2: ; CHECK: .long 0 ; CHECK-NEXT: .long 0 -; CHECK-NEXT: .long 8 +; CHECK-NEXT: .long 16 diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1145,9 +1145,8 @@ ; ; X86-SSE-LABEL: f20s128: ; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: subl $36, %esp +; X86-SSE-NEXT: subl $40, %esp ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) @@ -1155,18 +1154,11 @@ ; X86-SSE-NEXT: movl %eax, (%esp) ; X86-SSE-NEXT: calll __fixdfti ; X86-SSE-NEXT: subl $4, %esp -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) -; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movaps %xmm0, (%esi) ; X86-SSE-NEXT: movl %esi, %eax -; X86-SSE-NEXT: addl $36, %esp +; X86-SSE-NEXT: addl $40, %esp ; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: popl %edi ; X86-SSE-NEXT: retl $4 ; ; SSE-LABEL: f20s128: @@ -1490,9 +1482,8 @@ ; ; X86-SSE-LABEL: f20u128: ; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %edi ; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: subl $36, %esp +; X86-SSE-NEXT: subl $40, %esp ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) @@ -1500,18 +1491,11 
@@ ; X86-SSE-NEXT: movl %eax, (%esp) ; X86-SSE-NEXT: calll __fixunsdfti ; X86-SSE-NEXT: subl $4, %esp -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE-NEXT: movl %edi, 8(%esi) -; X86-SSE-NEXT: movl %edx, 12(%esi) -; X86-SSE-NEXT: movl %eax, (%esi) -; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movaps %xmm0, (%esi) ; X86-SSE-NEXT: movl %esi, %eax -; X86-SSE-NEXT: addl $36, %esp +; X86-SSE-NEXT: addl $40, %esp ; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: popl %edi ; X86-SSE-NEXT: retl $4 ; ; SSE-LABEL: f20u128: diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -37,7 +37,7 @@ ; X86-LABEL: TestFPExtF16_F128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $40, %esp ; X86-NEXT: movzwl vf16, %eax ; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __gnu_h2f_ieee @@ -55,7 +55,7 @@ ; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 ; X86-NEXT: movl %eax, vf128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $40, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl entry: @@ -87,7 +87,7 @@ ; X86-LABEL: TestFPExtF32_F128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $40, %esp ; X86-NEXT: flds vf32 ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: wait @@ -103,7 +103,7 @@ ; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 ; X86-NEXT: movl %eax, vf128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $40, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -34,7 +34,7 @@ ; X32-LABEL: TestFPExtF32_F128: ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %esi -; X32-NEXT: subl $24, %esp +; X32-NEXT: subl $40, %esp ; X32-NEXT: flds vf32 ; X32-NEXT: fstps {{[0-9]+}}(%esp) ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -49,7 +49,7 @@ ; X32-NEXT: movl %edx, vf128+8 ; X32-NEXT: movl %ecx, vf128+4 ; X32-NEXT: movl %eax, vf128 -; X32-NEXT: addl $24, %esp +; X32-NEXT: addl $40, %esp ; X32-NEXT: popl %esi ; X32-NEXT: retl ; @@ -424,7 +424,7 @@ ; X32-NEXT: pushl %eax ; X32-NEXT: calll __fixtfti ; X32-NEXT: addl $28, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -475,7 +475,7 @@ ; X32-NEXT: pushl %eax ; X32-NEXT: calll __fixunstfti ; X32-NEXT: addl $28, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -638,7 +638,7 @@ ; X32-NEXT: pushl %ecx ; X32-NEXT: calll __floatsitf ; X32-NEXT: addl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -686,7 +686,7 @@ ; X32-NEXT: pushl %ecx ; X32-NEXT: calll __floatunsitf ; X32-NEXT: addl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -732,7 +732,7 @@ ; X32-NEXT: pushl %eax ; X32-NEXT: calll __floatsitf ; X32-NEXT: addl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -778,7 +778,7 @@ ; X32-NEXT: pushl %eax ; X32-NEXT: calll __floatunsitf ; X32-NEXT: addl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -825,7 +825,7 @@ ; X32-NEXT: pushl %eax ; X32-NEXT: calll __floatditf ; X32-NEXT: addl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -872,7 +872,7 @@ ; X32-NEXT: pushl %eax ; X32-NEXT: calll __floatunditf ; X32-NEXT: addl $12, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -922,7 +922,7 @@ ; X32-NEXT: pushl %eax ; X32-NEXT: calll __floattitf ; X32-NEXT: addl $28, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -973,7 +973,7 @@ ; X32-NEXT: pushl %eax ; X32-NEXT: calll __floatuntitf ; X32-NEXT: addl $28, %esp -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl (%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1303,7 +1303,7 @@ ; X32-NEXT: addl $12, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl (%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi ; X32-NEXT: .LBB26_4: # %cleanup ; X32-NEXT: movl %edx, (%esi) diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -21,9 +21,8 @@ ; ; X86-LABEL: add: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -38,18 +37,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll __addtf3 ; X86-NEXT: addl $44, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %add = call fp128 @llvm.experimental.constrained.fadd.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -66,9 +58,8 @@ ; ; X86-LABEL: sub: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -83,18 +74,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll __subtf3 ; X86-NEXT: addl $44, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %sub = call fp128 @llvm.experimental.constrained.fsub.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -111,9 +95,8 @@ ; ; X86-LABEL: mul: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -128,18 +111,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll __multf3 ; X86-NEXT: addl $44, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %mul = call fp128 @llvm.experimental.constrained.fmul.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -156,9 +132,8 @@ ; ; X86-LABEL: div: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -173,18 +148,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll __divtf3 ; X86-NEXT: addl $44, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %div = call fp128 @llvm.experimental.constrained.fdiv.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -201,9 +169,8 @@ ; ; X86-LABEL: fma: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -222,18 +189,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll fmal ; X86-NEXT: addl $60, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps 
%xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %fma = call fp128 @llvm.experimental.constrained.fma.f128(fp128 %x, fp128 %y, fp128 %z, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -250,9 +210,8 @@ ; ; X86-LABEL: frem: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -267,18 +226,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll fmodl ; X86-NEXT: addl $44, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %div = call fp128 @llvm.experimental.constrained.frem.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -295,9 +247,8 @@ ; ; X86-LABEL: ceil: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -308,18 +259,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll ceill ; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %ceil = call fp128 @llvm.experimental.constrained.ceil.f128(fp128 %x, metadata !"fpexcept.strict") #0 @@ -336,9 +280,8 @@ ; ; X86-LABEL: cos: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -349,18 +292,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll cosl ; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %cos = call fp128 @llvm.experimental.constrained.cos.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -377,9 +313,8 @@ ; ; X86-LABEL: exp: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; 
X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -390,18 +325,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll expl ; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %exp = call fp128 @llvm.experimental.constrained.exp.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -418,9 +346,8 @@ ; ; X86-LABEL: exp2: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -431,18 +358,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll exp2l ; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %exp2 = call fp128 @llvm.experimental.constrained.exp2.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -459,9 +379,8 @@ ; ; X86-LABEL: floor: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -472,18 +391,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll floorl ; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %floor = call fp128 @llvm.experimental.constrained.floor.f128(fp128 %x, metadata !"fpexcept.strict") #0 @@ -500,9 +412,8 @@ ; ; X86-LABEL: log: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -513,18 +424,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll logl ; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl 
$24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %log = call fp128 @llvm.experimental.constrained.log.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -541,9 +445,8 @@ ; ; X86-LABEL: log10: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -554,18 +457,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll log10l ; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %log10 = call fp128 @llvm.experimental.constrained.log10.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -582,9 +478,8 @@ ; ; X86-LABEL: log2: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -595,18 +490,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll log2l ; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %log2 = call fp128 @llvm.experimental.constrained.log2.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -623,9 +511,8 @@ ; ; X86-LABEL: maxnum: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -640,18 +527,11 @@ ; X86-NEXT: pushl %eax ; X86-NEXT: calll fmaxl ; X86-NEXT: addl $44, %esp -; X86-NEXT: movl (%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, 8(%esi) -; X86-NEXT: movl %edx, 12(%esi) -; X86-NEXT: movl %eax, (%esi) -; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $24, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl $4 entry: %maxnum = call fp128 @llvm.experimental.constrained.maxnum.f128(fp128 %x, fp128 %y, metadata !"fpexcept.strict") #0 @@ -668,9 +548,8 @@ ; ; X86-LABEL: minnum: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $24, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: subl $12, %esp ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -685,18 +564,11 @@ ; X86-NEXT: pushl %eax ; 
X86-NEXT: calll fminl
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %minnum = call fp128 @llvm.experimental.constrained.minnum.f128(fp128 %x, fp128 %y, metadata !"fpexcept.strict") #0
@@ -713,9 +585,8 @@
 ;
 ; X86-LABEL: nearbyint:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -726,18 +597,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll nearbyintl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %nearbyint = call fp128 @llvm.experimental.constrained.nearbyint.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -754,9 +618,8 @@
 ;
 ; X86-LABEL: pow:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -771,18 +634,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll powl
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %pow = call fp128 @llvm.experimental.constrained.pow.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -799,9 +655,8 @@
 ;
 ; X86-LABEL: powi:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $8, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -813,18 +668,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __powitf2
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %powi = call fp128 @llvm.experimental.constrained.powi.f128(fp128 %x, i32 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -841,9 +689,8 @@
 ;
 ; X86-LABEL: rint:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -854,18 +701,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll rintl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %rint = call fp128 @llvm.experimental.constrained.rint.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -882,9 +722,8 @@
 ;
 ; X86-LABEL: round:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -895,18 +734,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll roundl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %round = call fp128 @llvm.experimental.constrained.round.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -923,9 +755,8 @@
 ;
 ; X86-LABEL: roundeven:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -936,18 +767,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll roundevenl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %roundeven = call fp128 @llvm.experimental.constrained.roundeven.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -964,9 +788,8 @@
 ;
 ; X86-LABEL: sin:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -977,18 +800,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll sinl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %sin = call fp128 @llvm.experimental.constrained.sin.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -1005,9 +821,8 @@
 ;
 ; X86-LABEL: sqrt:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -1018,18 +833,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll sqrtl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %sqrt = call fp128 @llvm.experimental.constrained.sqrt.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -1046,9 +854,8 @@
 ;
 ; X86-LABEL: trunc:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -1059,18 +866,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll truncl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 8(%esi)
-; X86-NEXT: movl %edx, 12(%esi)
-; X86-NEXT: movl %eax, (%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %trunc = call fp128 @llvm.experimental.constrained.trunc.f128(fp128 %x, metadata !"fpexcept.strict") #0
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll
--- a/llvm/test/CodeGen/X86/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll
@@ -22,8 +22,7 @@
 ;
 ; X86-LABEL: Test128Add:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -36,16 +35,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __addtf3
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %add = fadd fp128 %d1, %d2
@@ -66,8 +58,7 @@
 ;
 ; X86-LABEL: Test128_1Add:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -80,16 +71,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __addtf3
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %0 = load fp128, ptr @vf128, align 16
@@ -109,8 +93,7 @@
 ;
 ; X86-LABEL: Test128Sub:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -123,16 +106,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __subtf3
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sub = fsub fp128 %d1, %d2
@@ -153,8 +129,7 @@
 ;
 ; X86-LABEL: Test128_1Sub:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -167,16 +142,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __subtf3
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %0 = load fp128, ptr @vf128, align 16
@@ -196,8 +164,7 @@
 ;
 ; X86-LABEL: Test128Mul:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -210,16 +177,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __multf3
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %mul = fmul fp128 %d1, %d2
@@ -240,8 +200,7 @@
 ;
 ; X86-LABEL: Test128_1Mul:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -254,16 +213,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __multf3
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %0 = load fp128, ptr @vf128, align 16
@@ -283,8 +235,7 @@
 ;
 ; X86-LABEL: Test128Div:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -297,16 +248,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __divtf3
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %div = fdiv fp128 %d1, %d2
@@ -327,8 +271,7 @@
 ;
 ; X86-LABEL: Test128_1Div:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -341,16 +284,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll __divtf3
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %0 = load fp128, ptr @vf128, align 16
@@ -370,8 +306,7 @@
 ;
 ; X86-LABEL: Test128Rem:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -384,16 +319,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll fmodl
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %div = frem fp128 %d1, %d2
@@ -414,8 +342,7 @@
 ;
 ; X86-LABEL: Test128_1Rem:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -428,16 +355,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll fmodl
 ; X86-NEXT: addl $44, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+8
-; X86-NEXT: movl %edx, vf128+12
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %0 = load fp128, ptr @vf128, align 16
@@ -457,8 +377,7 @@
 ;
 ; X86-LABEL: Test128Sqrt:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -467,16 +386,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll sqrtl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.sqrt.f128(fp128 %d1)
@@ -496,8 +408,7 @@
 ;
 ; X86-LABEL: Test128Sin:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -506,16 +417,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll sinl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.sin.f128(fp128 %d1)
@@ -535,8 +439,7 @@
 ;
 ; X86-LABEL: Test128Cos:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -545,16 +448,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll cosl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.cos.f128(fp128 %d1)
@@ -574,8 +470,7 @@
 ;
 ; X86-LABEL: Test128Ceil:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -584,16 +479,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll ceill
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.ceil.f128(fp128 %d1)
@@ -613,8 +501,7 @@
 ;
 ; X86-LABEL: Test128Floor:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -623,16 +510,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll floorl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.floor.f128(fp128 %d1)
@@ -652,8 +532,7 @@
 ;
 ; X86-LABEL: Test128Trunc:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -662,16 +541,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll truncl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.trunc.f128(fp128 %d1)
@@ -691,8 +563,7 @@
 ;
 ; X86-LABEL: Test128Nearbyint:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -701,16 +572,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll nearbyintl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.nearbyint.f128(fp128 %d1)
@@ -730,8 +594,7 @@
 ;
 ; X86-LABEL: Test128Rint:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -740,16 +603,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll rintl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.rint.f128(fp128 %d1)
@@ -769,8 +625,7 @@
 ;
 ; X86-LABEL: Test128Round:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $36, %esp
+; X86-NEXT: subl $40, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
 ; X86-NEXT: pushl {{[0-9]+}}(%esp)
@@ -779,16 +634,9 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll roundl
 ; X86-NEXT: addl $28, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, vf128+12
-; X86-NEXT: movl %edx, vf128+8
-; X86-NEXT: movl %ecx, vf128+4
-; X86-NEXT: movl %eax, vf128
-; X86-NEXT: addl $24, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, vf128
+; X86-NEXT: addl $28, %esp
 ; X86-NEXT: retl
 entry:
 %sqrt = call fp128 @llvm.round.f128(fp128 %d1)
@@ -804,9 +652,8 @@
 ;
 ; X86-LABEL: Test128FMA:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: subl $20, %esp
+; X86-NEXT: subl $24, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: subl $12, %esp
 ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -825,18 +672,11 @@
 ; X86-NEXT: pushl %eax
 ; X86-NEXT: calll fmal
 ; X86-NEXT: addl $60, %esp
-; X86-NEXT: movl (%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, 12(%esi)
-; X86-NEXT: movl %edx, 8(%esi)
-; X86-NEXT: movl %ecx, 4(%esi)
-; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movaps (%esp), %xmm0
+; X86-NEXT: movaps %xmm0, (%esi)
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: addl $20, %esp
+; X86-NEXT: addl $24, %esp
 ; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
 ; X86-NEXT: retl $4
 entry:
 %call = call fp128 @llvm.fma.f128(fp128 %a, fp128 %b, fp128 %c)
diff --git a/llvm/test/CodeGen/X86/fpenv-combine.ll b/llvm/test/CodeGen/X86/fpenv-combine.ll
--- a/llvm/test/CodeGen/X86/fpenv-combine.ll
+++ b/llvm/test/CodeGen/X86/fpenv-combine.ll
@@ -15,19 +15,19 @@
 ; X64-NEXT: subq $40, %rsp
 ; X64-NEXT: movq %rsi, %rbx
 ; X64-NEXT: movq %rdi, %r14
-; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rdi
 ; X64-NEXT: callq fegetenv@PLT
 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT: movq (%rsp), %rcx
 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT: movq %rsi, 24(%r14)
+; X64-NEXT: movq %rsi, 16(%r14)
 ; X64-NEXT: movq %rcx, (%r14)
+; X64-NEXT: movq %rax, 24(%r14)
 ; X64-NEXT: movq %rdx, 8(%r14)
-; X64-NEXT: movq %rax, 16(%r14)
-; X64-NEXT: movq %rax, 16(%rbx)
-; X64-NEXT: movq %rsi, 24(%rbx)
+; X64-NEXT: movq %rsi, 16(%rbx)
 ; X64-NEXT: movq %rcx, (%rbx)
+; X64-NEXT: movq %rax, 24(%rbx)
 ; X64-NEXT: movq %rdx, 8(%rbx)
 ; X64-NEXT: addq $40, %rsp
 ; X64-NEXT: popq %rbx
@@ -72,9 +72,9 @@
 ; X64-NEXT: movq (%rsp), %rax
 ; X64-NEXT: andl $1, %eax
 ; X64-NEXT: movq %rax, (%rbx)
-; X64-NEXT: movq $0, 16(%rbx)
 ; X64-NEXT: movq $0, 24(%rbx)
 ; X64-NEXT: movq $0, 8(%rbx)
+; X64-NEXT: movq $0, 16(%rbx)
 ; X64-NEXT: addq $32, %rsp
 ; X64-NEXT: popq %rbx
 ; X64-NEXT: retq
@@ -94,9 +94,9 @@
 ; X64-NEXT: subq $40, %rsp
 ; X64-NEXT: movq %rsi, %rbx
 ; X64-NEXT: movq %rdi, %r14
-; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rdi
 ; X64-NEXT: callq fegetenv@PLT
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT: movq (%rsp), %rax
 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi
@@ -129,8 +129,8 @@
 ; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
 ; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
 ; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
-; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rax, (%rsp)
+; X64-NEXT: movq %rsp, %rdi
 ; X64-NEXT: callq fesetenv@PLT
 ; X64-NEXT: addq $40, %rsp
 ; X64-NEXT: retq
@@ -182,11 +182,11 @@
 ; X64-NEXT: subq $40, %rsp
 ; X64-NEXT: movq (%rdi), %rax
 ; X64-NEXT: andl $1, %eax
-; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; X64-NEXT: movq %rax, (%rsp)
 ; X64-NEXT: movq $0, {{[0-9]+}}(%rsp)
 ; X64-NEXT: movq $0, {{[0-9]+}}(%rsp)
 ; X64-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; X64-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rdi
 ; X64-NEXT: callq fesetenv@PLT
 ; X64-NEXT: addq $40, %rsp
 ; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll
--- a/llvm/test/CodeGen/X86/fpenv.ll
+++ b/llvm/test/CodeGen/X86/fpenv.ll
@@ -252,20 +252,20 @@
 define void @get_fpenv_01(ptr %ptr) #0 {
 ; X86-NOSSE-LABEL: get_fpenv_01:
 ; X86-NOSSE: # %bb.0: # %entry
-; X86-NOSSE-NEXT: subl $44, %esp
+; X86-NOSSE-NEXT: subl $60, %esp
 ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT: movl %eax, (%esp)
 ; X86-NOSSE-NEXT: calll fegetenv
-; X86-NOSSE-NEXT: addl $44, %esp
+; X86-NOSSE-NEXT: addl $60, %esp
 ; X86-NOSSE-NEXT: retl
 ;
 ; X86-SSE-LABEL: get_fpenv_01:
 ; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: subl $44, %esp
+; X86-SSE-NEXT: subl $60, %esp
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT: movl %eax, (%esp)
 ; X86-SSE-NEXT: calll fegetenv
-; X86-SSE-NEXT: addl $44, %esp
+; X86-SSE-NEXT: addl $60, %esp
 ; X86-SSE-NEXT: retl
 ;
 ; X64-LABEL: get_fpenv_01:
@@ -283,21 +283,21 @@
 define void @get_fpenv_01_native(ptr %ptr) nounwind {
 ; X86-NOSSE-LABEL: get_fpenv_01_native:
 ; X86-NOSSE: # %bb.0: # %entry
-; X86-NOSSE-NEXT: subl $36, %esp
+; X86-NOSSE-NEXT: subl $44, %esp
 ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT: fnstenv (%eax)
 ; X86-NOSSE-NEXT: fldenv (%eax)
-; X86-NOSSE-NEXT: addl $36, %esp
+; X86-NOSSE-NEXT: addl $44, %esp
 ; X86-NOSSE-NEXT: retl
 ;
 ; X86-SSE-LABEL: get_fpenv_01_native:
 ; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: subl $36, %esp
+; X86-SSE-NEXT: subl $44, %esp
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT: fnstenv (%eax)
 ; X86-SSE-NEXT: fldenv (%eax)
 ; X86-SSE-NEXT: stmxcsr 28(%eax)
-; X86-SSE-NEXT: addl $36, %esp
+; X86-SSE-NEXT: addl $44, %esp
 ; X86-SSE-NEXT: retl
 ;
 ; X64-LABEL: get_fpenv_01_native:
@@ -315,20 +315,20 @@
 define void @set_fpenv_01(ptr %ptr) #0 {
 ; X86-NOSSE-LABEL: set_fpenv_01:
 ; X86-NOSSE: # %bb.0: # %entry
-; X86-NOSSE-NEXT: subl $44, %esp
+; X86-NOSSE-NEXT: subl $60, %esp
 ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT: movl %eax, (%esp)
 ; X86-NOSSE-NEXT: calll fesetenv
-; X86-NOSSE-NEXT: addl $44, %esp
+; X86-NOSSE-NEXT: addl $60, %esp
 ; X86-NOSSE-NEXT: retl
 ;
 ; X86-SSE-LABEL: set_fpenv_01:
 ; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: subl $44, %esp
+; X86-SSE-NEXT: subl $60, %esp
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT: movl %eax, (%esp)
 ; X86-SSE-NEXT: calll fesetenv
-; X86-SSE-NEXT: addl $44, %esp
+; X86-SSE-NEXT: addl $60, %esp
 ; X86-SSE-NEXT: retl
 ;
 ; X64-LABEL: set_fpenv_01:
@@ -346,19 +346,19 @@
 define void @set_fpenv_01_native(ptr %ptr) nounwind {
 ; X86-NOSSE-LABEL: set_fpenv_01_native:
 ; X86-NOSSE: # %bb.0: # %entry
-; X86-NOSSE-NEXT: subl $36, %esp
+; X86-NOSSE-NEXT: subl $44, %esp
 ; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT: fldenv (%eax)
-; X86-NOSSE-NEXT: addl $36, %esp
+; X86-NOSSE-NEXT: addl $44, %esp
 ; X86-NOSSE-NEXT: retl
 ;
 ; X86-SSE-LABEL: set_fpenv_01_native:
 ; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: subl $36, %esp
+; X86-SSE-NEXT: subl $44, %esp
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE-NEXT: fldenv (%eax)
 ; X86-SSE-NEXT: ldmxcsr 28(%eax)
-; X86-SSE-NEXT: addl $36, %esp
+; X86-SSE-NEXT: addl $44, %esp
 ; X86-SSE-NEXT: retl
 ;
 ; X64-LABEL: set_fpenv_01_native:
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
--- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
@@ -690,7 +690,7 @@
 ; X86-X87-NEXT: pushl %ebx
 ; X86-X87-NEXT: pushl %edi
 ; X86-X87-NEXT: pushl %esi
-; X86-X87-NEXT: subl $44, %esp
+; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: flds {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -766,7 +766,7 @@
 ; X86-X87-NEXT: andl $15, %edx
 ; X86-X87-NEXT: movb %dl, 12(%ecx)
 ; X86-X87-NEXT: movl %ecx, %eax
-; X86-X87-NEXT: addl $44, %esp
+; X86-X87-NEXT: addl $60, %esp
 ; X86-X87-NEXT: popl %esi
 ; X86-X87-NEXT: popl %edi
 ; X86-X87-NEXT: popl %ebx
@@ -779,7 +779,7 @@
 ; X86-SSE-NEXT: pushl %ebx
 ; X86-SSE-NEXT: pushl %edi
 ; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: subl $28, %esp
+; X86-SSE-NEXT: subl $44, %esp
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
@@ -819,7 +819,7 @@
 ; X86-SSE-NEXT: andl $15, %eax
 ; X86-SSE-NEXT: movb %al, 12(%esi)
 ; X86-SSE-NEXT: movl %esi, %eax
-; X86-SSE-NEXT: addl $28, %esp
+; X86-SSE-NEXT: addl $44, %esp
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: popl %edi
 ; X86-SSE-NEXT: popl %ebx
@@ -859,7 +859,7 @@
 ; X86-X87-NEXT: pushl %ebx
 ; X86-X87-NEXT: pushl %edi
 ; X86-X87-NEXT: pushl %esi
-; X86-X87-NEXT: subl $44, %esp
+; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: flds {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -931,7 +931,7 @@
 ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-X87-NEXT: movl %eax, (%ecx)
 ; X86-X87-NEXT: movl %ecx, %eax
-; X86-X87-NEXT: addl $44, %esp
+; X86-X87-NEXT: addl $60, %esp
 ; X86-X87-NEXT: popl %esi
 ; X86-X87-NEXT: popl %edi
 ; X86-X87-NEXT: popl %ebx
@@ -954,7 +954,7 @@
 ; X86-SSE-NEXT: pushl %ebx
 ; X86-SSE-NEXT: pushl %edi
 ; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: subl $28, %esp
+; X86-SSE-NEXT: subl $44, %esp
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-SSE-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
@@ -990,7 +990,7 @@
 ; X86-SSE-NEXT: movl %edx, 4(%esi)
 ; X86-SSE-NEXT: movl %eax, (%esi)
 ; X86-SSE-NEXT: movl %esi, %eax
-; X86-SSE-NEXT: addl $28, %esp
+; X86-SSE-NEXT: addl $44, %esp
 ; X86-SSE-NEXT: popl %esi
 ; X86-SSE-NEXT: popl %edi
 ; X86-SSE-NEXT: popl %ebx
@@ -2882,7 +2882,7 @@
 ; X86-X87-NEXT: pushl %ebx
 ; X86-X87-NEXT: pushl %edi
 ; X86-X87-NEXT: pushl %esi
-; X86-X87-NEXT: subl $44, %esp
+; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
 ; X86-X87-NEXT: calll __gnu_h2f_ieee
@@ -2960,7 +2960,7 @@
 ; X86-X87-NEXT: andl $15, %edx
 ; X86-X87-NEXT: movb %dl, 12(%ecx)
 ; X86-X87-NEXT: movl %ecx, %eax
-; X86-X87-NEXT: addl $44, %esp
+; X86-X87-NEXT: addl $60, %esp
 ; X86-X87-NEXT: popl %esi
 ; X86-X87-NEXT: popl %edi
 ; X86-X87-NEXT: popl %ebx
@@ -3061,7 +3061,7 @@
 ; X86-X87-NEXT: pushl %ebx
 ; X86-X87-NEXT: pushl %edi
 ; X86-X87-NEXT: pushl %esi
-; X86-X87-NEXT: subl $44, %esp
+; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
 ; X86-X87-NEXT: calll __gnu_h2f_ieee
@@ -3135,7 +3135,7 @@
 ; X86-X87-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-X87-NEXT: movl %eax, (%ecx)
 ; X86-X87-NEXT: movl %ecx, %eax
-; X86-X87-NEXT: addl $44, %esp
+; X86-X87-NEXT: addl $60, %esp
 ; X86-X87-NEXT: popl %esi
 ; X86-X87-NEXT: popl %edi
 ; X86-X87-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
--- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
@@ -807,7 +807,7 @@
 ; X86-X87-NEXT: pushl %ebx
 ; X86-X87-NEXT: pushl %edi
 ; X86-X87-NEXT: pushl %esi
-; X86-X87-NEXT: subl $44, %esp
+; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: flds {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp)
 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax
@@ -864,7 +864,7 @@
 ; X86-X87-NEXT: movl %ebp, 4(%ecx)
 ; X86-X87-NEXT: movl %eax, (%ecx)
 ; X86-X87-NEXT: movl %ecx, %eax
-; X86-X87-NEXT: addl $44, %esp
+; X86-X87-NEXT: addl $60, %esp
 ; X86-X87-NEXT: popl %esi
 ; X86-X87-NEXT: popl %edi
 ; X86-X87-NEXT: popl %ebx
@@ -2818,7 +2818,7 @@
 ; X86-X87-NEXT: pushl %ebx
 ; X86-X87-NEXT: pushl %edi
 ; X86-X87-NEXT: pushl %esi
-; X86-X87-NEXT: subl $44, %esp
+; X86-X87-NEXT: subl $60, %esp
 ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax
 ; X86-X87-NEXT: movl %eax, (%esp)
 ; X86-X87-NEXT: calll __gnu_h2f_ieee
@@ -2877,7 +2877,7 @@
 ; X86-X87-NEXT: movl %ebp, 4(%ecx)
 ; X86-X87-NEXT: movl %eax, (%ecx)
 ; X86-X87-NEXT: movl %ecx, %eax
-; X86-X87-NEXT: addl $44, %esp
+; X86-X87-NEXT: addl $60, %esp
 ; X86-X87-NEXT: popl %esi
 ; X86-X87-NEXT: popl %edi
 ; X86-X87-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/implicit-null-check.ll b/llvm/test/CodeGen/X86/implicit-null-check.ll
--- a/llvm/test/CodeGen/X86/implicit-null-check.ll
+++ b/llvm/test/CodeGen/X86/implicit-null-check.ll
@@ -128,19 +128,15 @@
 ; CHECK: ## %bb.0: ## %entry
 ; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: Ltmp3:
-; CHECK-NEXT: movq (%rsi), %rcx ## on-fault: LBB5_1
+; CHECK-NEXT: movaps (%rsi), %xmm0 ## on-fault: LBB5_1
 ; CHECK-NEXT: ## %bb.2: ## %not_null
-; CHECK-NEXT: movq 8(%rsi), %rdx
-; CHECK-NEXT: movq 16(%rsi), %rdi
-; CHECK-NEXT: movq 24(%rsi), %rsi
-; CHECK-NEXT: movq %rsi, 24(%rax)
-; CHECK-NEXT: movq %rdi, 16(%rax)
-; CHECK-NEXT: movq %rdx, 8(%rax)
-; CHECK-NEXT: movq %rcx, (%rax)
+; CHECK-NEXT: movaps 16(%rsi), %xmm1
+; CHECK-NEXT: movaps %xmm1, 16(%rax)
+; CHECK-NEXT: movaps %xmm0, (%rax)
 ; CHECK-NEXT: retq
 ; CHECK-NEXT: LBB5_1: ## %is_null
-; CHECK-NEXT: movq $0, 24(%rax)
-; CHECK-NEXT: movq $0, 16(%rax)
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps %xmm0, 16(%rax)
 ; CHECK-NEXT: movq $0, 8(%rax)
 ; CHECK-NEXT: movq $42, (%rax)
 ; CHECK-NEXT: retq
@@ -622,7 +618,8 @@
 define i64 @imp_null_check_load_shift_add_addr(ptr %x) {
 ; CHECK-LABEL: imp_null_check_load_shift_add_addr:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK: movq 3526(,%rdi,8), %rax ## on-fault: LBB23_1
+; CHECK-NEXT: Ltmp19:
+; CHECK-NEXT: movq 3526(,%rdi,8), %rax ## on-fault: LBB23_1
 ; CHECK-NEXT: ## %bb.2: ## %not_null
 ; CHECK-NEXT: retq
 ; CHECK-NEXT: LBB23_1: ## %is_null
diff --git a/llvm/test/CodeGen/X86/osx-private-labels.ll b/llvm/test/CodeGen/X86/osx-private-labels.ll
--- a/llvm/test/CodeGen/X86/osx-private-labels.ll
+++ b/llvm/test/CodeGen/X86/osx-private-labels.ll
@@ -36,7 +36,7 @@
 @private6 = private unnamed_addr constant i128 42
 ; CHECK: .section __TEXT,__literal16,16byte_literals
-; CHECK-NEXT: .p2align 3
+; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: L_private6:
 %struct._objc_class = type { ptr }
diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
--- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll
+++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll
@@ -218,6 +218,8 @@
 ; ILP-LABEL: test2:
 ; ILP: # %bb.0:
 ; ILP-NEXT: movq %rdi, %rax
+; ILP-NEXT: xorps %xmm0, %xmm0
+; ILP-NEXT: movaps %xmm0, 16(%rdi)
 ; ILP-NEXT: xorl %edi, %edi
 ; ILP-NEXT: movq %rsi, %r11
 ; ILP-NEXT: negq %r11
@@ -250,14 +252,14 @@
 ; ILP-NEXT: orq %rdi, %r9
 ; ILP-NEXT: cmovneq %rcx, %r8
 ; ILP-NEXT: movq %r8, (%rax)
-; ILP-NEXT: movq $0, 24(%rax)
-; ILP-NEXT: movq $0, 16(%rax)
 ; ILP-NEXT: movq $0, 8(%rax)
 ; ILP-NEXT: retq
 ;
 ; HYBRID-LABEL: test2:
 ; HYBRID: # %bb.0:
 ; HYBRID-NEXT: movq %rdi, %rax
+; HYBRID-NEXT: xorps %xmm0, %xmm0
+; HYBRID-NEXT: movaps %xmm0, 16(%rdi)
 ; HYBRID-NEXT: xorl %edi, %edi
 ; HYBRID-NEXT: movq %rsi, %r11
 ; HYBRID-NEXT: negq %r11
@@ -290,14 +292,14 @@
 ; HYBRID-NEXT: orq %rdi, %r9
 ; HYBRID-NEXT: cmovneq %rcx, %r8
 ; HYBRID-NEXT: movq %r8, (%rax)
-; HYBRID-NEXT: movq $0, 24(%rax)
-; HYBRID-NEXT: movq $0, 16(%rax)
 ; HYBRID-NEXT: movq $0, 8(%rax)
 ; HYBRID-NEXT: retq
 ;
 ; BURR-LABEL: test2:
 ; BURR: # %bb.0:
 ; BURR-NEXT: movq %rdi, %rax
+; BURR-NEXT: xorps %xmm0, %xmm0
+; BURR-NEXT: movaps %xmm0, 16(%rdi)
 ; BURR-NEXT: xorl %edi, %edi
 ; BURR-NEXT: movq %rsi, %r11
 ; BURR-NEXT: negq %r11
@@ -330,8 +332,6 @@
 ; BURR-NEXT: orq %rdi, %r9
 ; BURR-NEXT: cmovneq %rcx, %r8
 ; BURR-NEXT: movq %r8, (%rax)
-; BURR-NEXT: movq $0, 24(%rax)
-; BURR-NEXT: movq $0, 16(%rax)
 ; BURR-NEXT: movq $0, 8(%rax)
 ; BURR-NEXT: retq
 ;
@@ -369,15 +369,17 @@
 ; SRC-NEXT: subq $-128, %r8
 ; SRC-NEXT: orq %r9, %rdi
 ; SRC-NEXT: cmovneq %rdx, %r8
+; SRC-NEXT: xorps %xmm0, %xmm0
+; SRC-NEXT: movaps %xmm0, 16(%rax)
 ; SRC-NEXT: movq %r8, (%rax)
-; SRC-NEXT: movq $0, 24(%rax)
-; SRC-NEXT: movq $0, 16(%rax)
 ; SRC-NEXT: movq $0, 8(%rax)
 ; SRC-NEXT: retq
 ;
 ; LIN-LABEL: test2:
 ; LIN: # %bb.0:
 ; LIN-NEXT: movq %rdi, %rax
+; LIN-NEXT: xorps %xmm0, %xmm0
+; LIN-NEXT: movaps %xmm0, 16(%rdi)
 ; LIN-NEXT: movq %rsi, %rdi
 ; LIN-NEXT: negq %rdi
 ; LIN-NEXT: andq %rsi, %rdi
@@ -411,8 +413,6 @@
 ; LIN-NEXT: cmoveq %rdx, %r8
 ; LIN-NEXT: movq %r8, (%rax)
 ; LIN-NEXT: movq $0, 8(%rax)
-; LIN-NEXT: movq $0, 16(%rax)
-; LIN-NEXT: movq $0, 24(%rax)
 ; LIN-NEXT: retq
 %b = sub i256 0, %a
 %c = and i256 %b, %a
@@ -425,6 +425,8 @@
 ; ILP: # %bb.0:
 ; ILP-NEXT: pushq %rbx
 ; ILP-NEXT: movq %rdi, %rax
+; ILP-NEXT: xorps %xmm0, %xmm0
+; ILP-NEXT: movaps %xmm0, 16(%rdi)
 ; ILP-NEXT: xorl %r9d, %r9d
 ; ILP-NEXT: movq %rsi, %rdi
 ; ILP-NEXT: negq %rdi
@@ -439,13 +441,13 @@
 ; ILP-NEXT: notq %rdx
 ; ILP-NEXT: andq %r10, %rdx
 ; ILP-NEXT: bsrq %rdx, %r9
-; ILP-NEXT: notq %rsi
 ; ILP-NEXT: xorq $63, %rbx
 ; ILP-NEXT: notq %rcx
 ; ILP-NEXT: andq %r11, %rcx
 ; ILP-NEXT: bsrq %rcx, %r10
 ; ILP-NEXT: xorq $63, %r10
 ; ILP-NEXT: addq $64, %r10
+; ILP-NEXT: notq %rsi
 ; ILP-NEXT: testq %r8, %r8
 ; ILP-NEXT: cmovneq %rbx, %r10
 ; ILP-NEXT: xorq $63, %r9
@@ -461,8 +463,6 @@
 ; ILP-NEXT: orq %r8, %rcx
 ; ILP-NEXT: cmovneq %r10, %rsi
 ; ILP-NEXT: movq %rsi, (%rax)
-; ILP-NEXT: movq $0, 24(%rax)
-; ILP-NEXT: movq $0, 16(%rax)
 ; ILP-NEXT: movq $0, 8(%rax)
 ; ILP-NEXT: popq %rbx
 ; ILP-NEXT: retq
@@ -471,6 +471,8 @@
 ; HYBRID: # %bb.0:
 ; HYBRID-NEXT: pushq %rbx
 ; HYBRID-NEXT: movq %rdi, %rax
+; HYBRID-NEXT: xorps %xmm0, %xmm0
+; HYBRID-NEXT: movaps %xmm0, 16(%rdi)
 ; HYBRID-NEXT: xorl %r9d, %r9d
 ; HYBRID-NEXT: movq %rsi, %rdi
 ; HYBRID-NEXT: negq %rdi
@@ -507,8 +509,6 @@
 ; HYBRID-NEXT: orq %r8, %rcx
 ; HYBRID-NEXT: cmovneq %r9, %rsi
 ; HYBRID-NEXT: movq %rsi, (%rax)
-; HYBRID-NEXT: movq $0, 24(%rax)
-; HYBRID-NEXT: movq $0, 16(%rax)
 ; HYBRID-NEXT: movq $0, 8(%rax)
 ; HYBRID-NEXT: popq %rbx
 ; HYBRID-NEXT: retq
@@ -517,6 +517,8 @@
 ; BURR: # %bb.0:
 ; BURR-NEXT: pushq %rbx
 ; BURR-NEXT: movq %rdi, %rax
+; BURR-NEXT: xorps %xmm0, %xmm0
+; BURR-NEXT: movaps %xmm0, 16(%rdi)
 ; BURR-NEXT: xorl %r9d, %r9d
 ; BURR-NEXT: movq %rsi, %rdi
 ; BURR-NEXT: negq %rdi
@@ -553,8 +555,6 @@
 ; BURR-NEXT: orq %r8, %rcx
 ; BURR-NEXT: cmovneq %r9, %rsi
 ; BURR-NEXT: movq %rsi, (%rax)
-; BURR-NEXT: movq $0, 24(%rax)
-; BURR-NEXT: movq $0, 16(%rax)
 ; BURR-NEXT: movq $0, 8(%rax)
 ; BURR-NEXT: popq %rbx
 ; BURR-NEXT: retq
@@ -597,15 +597,17 @@
 ; SRC-NEXT: subq $-128, %r10
 ; SRC-NEXT: orq %rcx, %r8
 ; SRC-NEXT: cmovneq %r9, %r10
+; SRC-NEXT: xorps %xmm0, %xmm0
+; SRC-NEXT: movaps %xmm0, 16(%rax)
 ; SRC-NEXT: movq %r10, (%rax)
-; SRC-NEXT: movq $0, 24(%rax)
-; SRC-NEXT: movq $0, 16(%rax)
 ; SRC-NEXT: movq $0, 8(%rax)
 ; SRC-NEXT: retq
 ;
 ; LIN-LABEL: test3:
 ; LIN: # %bb.0:
 ; LIN-NEXT: movq %rdi, %rax
+; LIN-NEXT: xorps %xmm0, %xmm0
+; LIN-NEXT: movaps %xmm0, 16(%rdi)
 ; LIN-NEXT: movq %rsi, %rdi
 ; LIN-NEXT: negq %rdi
 ; LIN-NEXT: notq %rsi
@@ -643,8 +645,6 @@
 ; LIN-NEXT: cmoveq %rsi, %rdi
 ; LIN-NEXT: movq %rdi, (%rax)
 ; LIN-NEXT: movq $0, 8(%rax)
-; LIN-NEXT: movq $0, 16(%rax)
-; LIN-NEXT: movq $0, 24(%rax)
 ; LIN-NEXT: retq
 %m = sub i256 -1, %n
 %x = sub i256 0, %n
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -306,8 +306,8 @@
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $72, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
 ; X86-NEXT: movl 8(%ebp), %ecx
 ; X86-NEXT: movl 12(%ebp), %eax
 ; X86-NEXT: movl 20(%ebp), %edx
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -369,8 +369,8 @@
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $88, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $96, %esp
 ; X86-NEXT: movl 8(%ebp), %ecx
 ; X86-NEXT: movl 12(%ebp), %eax
 ; X86-NEXT: movl 20(%ebp), %esi
@@ -805,8 +805,8 @@
 ; X86-NEXT: pushl %ebx
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $192, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $208, %esp
 ; X86-NEXT: movl 36(%ebp), %esi
 ; X86-NEXT: movl 16(%ebp), %ebx
 ; X86-NEXT: movl 32(%ebp), %eax
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -774,13 +774,11 @@
 define i32 @ne_i128_pair(ptr %a, ptr %b) {
 ; SSE2-LABEL: ne_i128_pair:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu 16(%rdi), %xmm1
-; SSE2-NEXT: movdqu (%rsi), %xmm2
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: movdqu 16(%rsi), %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: pcmpeqb 16(%rsi), %xmm1
+; SSE2-NEXT: pcmpeqb (%rsi), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %ecx
 ; SSE2-NEXT: xorl %eax, %eax
 ; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
@@ -789,13 +787,11 @@
 ;
 ; SSE41-LABEL: ne_i128_pair:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqu (%rdi), %xmm0
-; SSE41-NEXT: movdqu 16(%rdi), %xmm1
-; SSE41-NEXT: movdqu (%rsi), %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: movdqu 16(%rsi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: pxor 16(%rsi), %xmm1
+; SSE41-NEXT: pxor (%rsi), %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: xorl %eax, %eax
 ; SSE41-NEXT: ptest %xmm0, %xmm0
 ; SSE41-NEXT: setne %al
@@ -803,8 +799,8 @@
 ;
 ; AVXANY-LABEL: ne_i128_pair:
 ; AVXANY: # %bb.0:
-; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
-; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVXANY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXANY-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVXANY-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
 ; AVXANY-NEXT: vpxor (%rsi), %xmm0, %xmm0
 ; AVXANY-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -832,13 +828,11 @@
 define i32 @eq_i128_pair(ptr %a, ptr %b) {
 ; SSE2-LABEL: eq_i128_pair:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu 16(%rdi), %xmm1
-; SSE2-NEXT: movdqu (%rsi), %xmm2
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm2
-; SSE2-NEXT: movdqu 16(%rsi), %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: pcmpeqb 16(%rsi), %xmm1
+; SSE2-NEXT: pcmpeqb (%rsi), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
 ; SSE2-NEXT: pmovmskb %xmm0, %ecx
 ; SSE2-NEXT: xorl %eax, %eax
 ; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
@@ -847,13 +841,11 @@
 ;
 ; SSE41-LABEL: eq_i128_pair:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqu (%rdi), %xmm0
-; SSE41-NEXT: movdqu 16(%rdi), %xmm1
-; SSE41-NEXT: movdqu (%rsi), %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: movdqu 16(%rsi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: movdqa 16(%rdi), %xmm1
+; SSE41-NEXT: pxor 16(%rsi), %xmm1
+; SSE41-NEXT: pxor (%rsi), %xmm0
+; SSE41-NEXT: por %xmm1, %xmm0
 ; SSE41-NEXT: xorl %eax, %eax
 ; SSE41-NEXT: ptest %xmm0, %xmm0
 ; SSE41-NEXT: sete %al
@@ -861,8 +853,8 @@
 ;
 ; AVXANY-LABEL: eq_i128_pair:
 ; AVXANY: # %bb.0:
-; AVXANY-NEXT: vmovdqu (%rdi), %xmm0
-; AVXANY-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVXANY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXANY-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVXANY-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
 ; AVXANY-NEXT: vpxor (%rsi), %xmm0, %xmm0
 ; AVXANY-NEXT: vpor %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -804,7 +804,7 @@
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT: movl %edx, 12(%eax)
 ; X86-NEXT: movb %cl, 16(%eax)
-; X86-NEXT: setne 20(%eax)
+; X86-NEXT: setne 32(%eax)
 ; X86-NEXT: addl $188, %esp
 ; X86-NEXT: popl %esi
 ; X86-NEXT: popl %edi
@@ -990,7 +990,7 @@
 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT: movq %rcx, (%rax)
 ; X64-NEXT: movb %sil, 16(%rax)
-; X64-NEXT: setne 24(%rax)
+; X64-NEXT: setne 32(%rax)
 ; X64-NEXT: popq %rbx
 ; X64-NEXT: popq %r12
 ; X64-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/sret-implicit.ll b/llvm/test/CodeGen/X86/sret-implicit.ll
--- a/llvm/test/CodeGen/X86/sret-implicit.ll
+++ b/llvm/test/CodeGen/X86/sret-implicit.ll
@@ -25,7 +25,8 @@
 ; X64-LABEL: sret_demoted
 ; X64-DAG: movq %rdi, %rax
-; X64-DAG: movq $0, (%rdi)
+; X64-DAG: xorps %xmm0, %xmm0
+; X64-DAG: movaps %xmm0, (%rdi)
 ; X64: retq
 ; X86-LABEL: sret_demoted
diff --git a/llvm/test/CodeGen/X86/statepoint-deopt-lowering.ll b/llvm/test/CodeGen/X86/statepoint-deopt-lowering.ll
--- a/llvm/test/CodeGen/X86/statepoint-deopt-lowering.ll
+++ b/llvm/test/CodeGen/X86/statepoint-deopt-lowering.ll
@@ -43,35 +43,25 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: subq $248, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 256
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $144, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq $144, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq $144, (%rsp)
 ; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq $144, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq $144, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: callq foo@PLT
 ; CHECK-NEXT: .Ltmp2:
@@ -89,60 +79,36 @@
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: subq $248, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 256
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm6
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm7
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm10
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm11
 ; CHECK-NEXT: movq %r9, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq %r8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq %rdi, (%rsp)
+; CHECK-NEXT: movaps %xmm11, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm10, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm9, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm5, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm4, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm3, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm2, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: callq foo@PLT
 ; CHECK-NEXT: .Ltmp3:
 ; CHECK-NEXT: addq $248, %rsp
diff --git a/llvm/test/CodeGen/X86/statepoint-vector.ll b/llvm/test/CodeGen/X86/statepoint-vector.ll
--- a/llvm/test/CodeGen/X86/statepoint-vector.ll
+++ b/llvm/test/CodeGen/X86/statepoint-vector.ll
@@ -122,9 +122,9 @@
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
 ; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq $-1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq $-1, (%rsp)
 ; CHECK-NEXT: callq do_safepoint@PLT
 ; CHECK-NEXT: .Ltmp4:
 ; CHECK-NEXT: addq $40, %rsp
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -152,8 +152,8 @@
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
 ; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl 12(%ebp), %ecx
 ; X86-NEXT: movl %ecx, %edx
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -193,8 +193,8 @@
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $32, %esp
 ; X86-NEXT: movl 8(%ebp), %eax
 ; X86-NEXT: movl 12(%ebp), %ecx
 ; X86-NEXT: movl %ecx, %edx
diff --git a/llvm/test/tools/llvm-lto2/X86/pipeline.ll b/llvm/test/tools/llvm-lto2/X86/pipeline.ll
--- a/llvm/test/tools/llvm-lto2/X86/pipeline.ll
+++ b/llvm/test/tools/llvm-lto2/X86/pipeline.ll
@@ -15,7 +15,7 @@
 ; is accepted).
 ; RUN: llvm-lto2 run %t1.bc -o %t.o -r %t1.bc,patatino,px
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 define void @patatino() {
diff --git a/llvm/test/tools/llvm-lto2/X86/slp-vectorize-pm.ll b/llvm/test/tools/llvm-lto2/X86/slp-vectorize-pm.ll
--- a/llvm/test/tools/llvm-lto2/X86/slp-vectorize-pm.ll
+++ b/llvm/test/tools/llvm-lto2/X86/slp-vectorize-pm.ll
@@ -26,7 +26,7 @@
 ; CHECK-O2-LPV: = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-O3-LPV: = !{!"llvm.loop.isvectorized", i32 1}
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target datalayout = "e-m:e-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 define i32 @foo(ptr %a) {
diff --git a/llvm/test/tools/llvm-lto2/X86/stats-file-option.ll b/llvm/test/tools/llvm-lto2/X86/stats-file-option.ll
--- a/llvm/test/tools/llvm-lto2/X86/stats-file-option.ll
+++ b/llvm/test/tools/llvm-lto2/X86/stats-file-option.ll
@@ -6,7 +6,7 @@
 ; RUN: llvm-lto2 run %t1.bc -o %t.o -r %t1.bc,patatino,px -stats-file=%t2.stats
 ; RUN: FileCheck --input-file=%t2.stats %s
-target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 define void @patatino() {
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -19,14 +19,16 @@
 "x86_64-unknown-linux-gnu");
 std::string DL2 = UpgradeDataLayoutString(
 "e-m:w-p:32:32-i64:64-f80:32-n8:16:32-S32", "i686-pc-windows-msvc");
-  std::string DL3 = UpgradeDataLayoutString("e-m:o-i64:64-i128:128-n32:64-S128",
-                                            "x86_64-apple-macosx");
-  EXPECT_EQ(DL1, "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64"
-                 "-f80:128-n8:16:32:64-S128");
-  EXPECT_EQ(DL2, "e-m:w-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64"
-                 "-f80:128-n8:16:32-S32");
-  EXPECT_EQ(DL3, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128"
-                 "-n32:64-S128");
+  std::string DL3 = UpgradeDataLayoutString(
+      "e-m:o-i64:64-f80:128-n8:16:32:64-S128", "x86_64-apple-macosx");
+  EXPECT_EQ(DL1,
+            "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128"
+            "-f80:128-n8:16:32:64-S128");
+  EXPECT_EQ(DL2,
+            "e-m:w-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128"
+            "-f80:128-n8:16:32-S32");
+  EXPECT_EQ(DL3, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:"
+                 "128-n8:16:32:64-S128");
 // Check that AMDGPU targets add -G1 if it's not present.
 EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "e-p:32:32-G1");
@@ -58,21 +60,22 @@
 TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
 std::string DL1 = UpgradeDataLayoutString(
-      "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
+      "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-"
+      "f32:32:32"
 "-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 "-n8:16:32:64-S128",
 "x86_64-unknown-linux-gnu");
-  std::string DL2 = UpgradeDataLayoutString("e-p:32:32", "i686-apple-darwin9");
-  std::string DL3 = UpgradeDataLayoutString("e-m:e-i64:64-n32:64",
+  std::string DL2 = UpgradeDataLayoutString("e-m:e-i64:64-n32:64",
 "powerpc64le-unknown-linux-gnu");
-  std::string DL4 =
+  std::string DL3 =
 UpgradeDataLayoutString("e-m:o-i64:64-i128:128-n32:64-S128", "aarch64--");
-  EXPECT_EQ(DL1, "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64"
-                 "-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64"
-                 "-f80:128:128-n8:16:32:64-S128");
-  EXPECT_EQ(DL2, "e-p:32:32");
-  EXPECT_EQ(DL3, "e-m:e-i64:64-n32:64");
-  EXPECT_EQ(DL4, "e-m:o-i64:64-i128:128-n32:64-S128");
+  EXPECT_EQ(
+      DL1,
+      "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128"
+      "-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64"
+      "-f80:128:128-n8:16:32:64-S128");
+  EXPECT_EQ(DL2, "e-m:e-i64:64-n32:64");
+  EXPECT_EQ(DL3, "e-m:o-i64:64-i128:128-n32:64-S128");
 // Check that AMDGPU targets don't add -G1 if there is already a -G flag.
 EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"), "e-p:32:32-G2");
diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
--- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
+++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp
@@ -70,8 +70,8 @@
 void SetUp() {
 // Boilerplate that creates a MachineFunction and associated blocks.
-    Mod->setDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-"
-                       "n8:16:32:64-S128");
+    Mod->setDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
+                       "f80:128-n8:16:32:64-S128");
 Triple TargetTriple("x86_64--");
 std::string Error;
 const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error);
@@ -475,8 +475,8 @@
 auto MIRParse = createMIRParser(std::move(MemBuf), Ctx);
 Mod = MIRParse->parseIRModule();
 assert(Mod);
-    Mod->setDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-"
-                       "n8:16:32:64-S128");
+    Mod->setDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
+                       "f80:128-n8:16:32:64-S128");
 bool Result = MIRParse->parseMachineFunctions(*Mod, *MMI);
 assert(!Result && "Failed to parse unit test machine function?");