Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -87,7 +87,7 @@
     cl::Hidden);
 
 static cl::opt<bool> ExperimentalUnorderedISEL(
-    "x86-experimental-unordered-atomic-isel", cl::init(false),
+    "x86-experimental-unordered-atomic-isel", cl::init(true),
     cl::desc("Use LoadSDNode and StoreSDNode instead of "
              "AtomicSDNode for unordered atomic loads and "
              "stores respectively."),
Index: llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
===================================================================
--- llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
+++ llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
@@ -12,23 +12,13 @@
 define void @store_fp128(fp128* %fptr, fp128 %v) {
 ; X64-NOSSE-LABEL: store_fp128:
 ; X64-NOSSE:       # %bb.0:
-; X64-NOSSE-NEXT:    pushq %rax
-; X64-NOSSE-NEXT:    .cfi_def_cfa_offset 16
-; X64-NOSSE-NEXT:    callq __sync_lock_test_and_set_16
-; X64-NOSSE-NEXT:    popq %rax
-; X64-NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; X64-NOSSE-NEXT:    movq %rdx, 8(%rdi)
+; X64-NOSSE-NEXT:    movq %rsi, (%rdi)
 ; X64-NOSSE-NEXT:    retq
 ;
 ; X64-SSE-LABEL: store_fp128:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    subq $24, %rsp
-; X64-SSE-NEXT:    .cfi_def_cfa_offset 32
-; X64-SSE-NEXT:    movaps %xmm0, (%rsp)
-; X64-SSE-NEXT:    movq (%rsp), %rsi
-; X64-SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-SSE-NEXT:    callq __sync_lock_test_and_set_16
-; X64-SSE-NEXT:    addq $24, %rsp
-; X64-SSE-NEXT:    .cfi_def_cfa_offset 8
+; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
 ; X64-SSE-NEXT:    retq
   store atomic fp128 %v, fp128* %fptr unordered, align 16
   ret void
Index: llvm/test/CodeGen/X86/atomic-non-integer.ll
===================================================================
--- llvm/test/CodeGen/X86/atomic-non-integer.ll
+++ llvm/test/CodeGen/X86/atomic-non-integer.ll
@@ -114,12 +114,26 @@
 }
 
 define void @store_float(float* %fptr, float %v) {
-; X86-LABEL: store_float:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    retl
+; X86-SSE-LABEL: store_float:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movss %xmm0, (%eax)
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: store_float:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovss %xmm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X86-NOSSE-LABEL: store_float:
+; X86-NOSSE:       # %bb.0:
+; X86-NOSSE-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    fstps (%eax)
+; X86-NOSSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: store_float:
 ; X64-SSE:       # %bb.0:
@@ -162,16 +176,16 @@
 ;
 ; X86-SSE2-LABEL: store_double:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE2-NEXT:    movlps %xmm0, (%eax)
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movsd %xmm0, (%eax)
 ; X86-SSE2-NEXT:    retl
 ;
 ; X86-AVX-LABEL: store_double:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX-NEXT:    vmovlps %xmm0, (%eax)
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovsd %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-NOSSE-LABEL: store_double:
@@ -215,87 +229,65 @@
 define void @store_fp128(fp128* %fptr, fp128 %v) {
 ; X86-SSE-LABEL: store_fp128:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    subl $36, %esp
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 36
-; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    calll __sync_lock_test_and_set_16
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-SSE-NEXT:    addl $56, %esp
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset -56
+; X86-SSE-NEXT:    pushl %edi
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE-NEXT:    pushl %esi
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 12
+; X86-SSE-NEXT:    .cfi_offset %esi, -12
+; X86-SSE-NEXT:    .cfi_offset %edi, -8
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SSE-NEXT:    movl %esi, 12(%edi)
+; X86-SSE-NEXT:    movl %edx, 8(%edi)
+; X86-SSE-NEXT:    movl %ecx, 4(%edi)
+; X86-SSE-NEXT:    movl %eax, (%edi)
+; X86-SSE-NEXT:    popl %esi
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE-NEXT:    popl %edi
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: store_fp128:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $44, %esp
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 48
-; X86-AVX-NEXT:    vmovaps {{[0-9]+}}(%esp), %xmm0
+; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl %eax, (%esp)
-; X86-AVX-NEXT:    calll __sync_lock_test_and_set_16
-; X86-AVX-NEXT:    addl $40, %esp
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl
 ;
 ; X86-NOSSE-LABEL: store_fp128:
 ; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    subl $36, %esp
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 36
-; X86-NOSSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    calll __sync_lock_test_and_set_16
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-NOSSE-NEXT:    addl $56, %esp
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset -56
+; X86-NOSSE-NEXT:    pushl %edi
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT:    pushl %esi
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; X86-NOSSE-NEXT:    .cfi_offset %esi, -12
+; X86-NOSSE-NEXT:    .cfi_offset %edi, -8
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOSSE-NEXT:    movl %esi, 12(%edi)
+; X86-NOSSE-NEXT:    movl %edx, 8(%edi)
+; X86-NOSSE-NEXT:    movl %ecx, 4(%edi)
+; X86-NOSSE-NEXT:    movl %eax, (%edi)
+; X86-NOSSE-NEXT:    popl %esi
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
+; X86-NOSSE-NEXT:    popl %edi
+; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NOSSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: store_fp128:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    subq $24, %rsp
-; X64-SSE-NEXT:    .cfi_def_cfa_offset 32
-; X64-SSE-NEXT:    movaps %xmm0, (%rsp)
-; X64-SSE-NEXT:    movq (%rsp), %rsi
-; X64-SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-SSE-NEXT:    callq __sync_lock_test_and_set_16
-; X64-SSE-NEXT:    addq $24, %rsp
-; X64-SSE-NEXT:    .cfi_def_cfa_offset 8
+; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: store_fp128:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    subq $24, %rsp
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 32
-; X64-AVX-NEXT:    vmovaps %xmm0, (%rsp)
-; X64-AVX-NEXT:    movq (%rsp), %rsi
-; X64-AVX-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-AVX-NEXT:    callq __sync_lock_test_and_set_16
-; X64-AVX-NEXT:    addq $24, %rsp
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
 ; X64-AVX-NEXT:    retq
   store atomic fp128 %v, fp128* %fptr unordered, align 16
   ret void
@@ -383,53 +375,11 @@
 }
 
 define float @load_float(float* %fptr) {
-; X86-SSE1-LABEL: load_float:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    pushl %eax
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl (%eax), %eax
-; X86-SSE1-NEXT:    movl %eax, (%esp)
-; X86-SSE1-NEXT:    flds (%esp)
-; X86-SSE1-NEXT:    popl %eax
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: load_float:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    pushl %eax
-; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT:    movss %xmm0, (%esp)
-; X86-SSE2-NEXT:    flds (%esp)
-; X86-SSE2-NEXT:    popl %eax
-; X86-SSE2-NEXT:    .cfi_def_cfa_offset 4
-; X86-SSE2-NEXT:    retl
-;
-; X86-AVX-LABEL: load_float:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %eax
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vmovss %xmm0, (%esp)
-; X86-AVX-NEXT:    flds (%esp)
-; X86-AVX-NEXT:    popl %eax
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
-; X86-AVX-NEXT:    retl
-;
-; X86-NOSSE-LABEL: load_float:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl (%eax), %eax
-; X86-NOSSE-NEXT:    movl %eax, (%esp)
-; X86-NOSSE-NEXT:    flds (%esp)
-; X86-NOSSE-NEXT:    popl %eax
-; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 4
-; X86-NOSSE-NEXT:    retl
+; X86-LABEL: load_float:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    flds (%eax)
+; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: load_float:
 ; X64-SSE:       # %bb.0:
@@ -445,61 +395,11 @@
 }
 
 define double @load_double(double* %fptr) {
-; X86-SSE1-LABEL: load_double:
-; X86-SSE1:       # %bb.0:
-; X86-SSE1-NEXT:    subl $20, %esp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 24
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    fildll (%eax)
-; X86-SSE1-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-SSE1-NEXT:    movl %eax, (%esp)
-; X86-SSE1-NEXT:    fldl (%esp)
-; X86-SSE1-NEXT:    addl $20, %esp
-; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
-; X86-SSE1-NEXT:    retl
-;
-; X86-SSE2-LABEL: load_double:
-; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    subl $12, %esp
-; X86-SSE2-NEXT:    .cfi_def_cfa_offset 16
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE2-NEXT:    movlps %xmm0, (%esp)
-; X86-SSE2-NEXT:    fldl (%esp)
-; X86-SSE2-NEXT:    addl $12, %esp
-; X86-SSE2-NEXT:    .cfi_def_cfa_offset 4
-; X86-SSE2-NEXT:    retl
-;
-; X86-AVX-LABEL: load_double:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    subl $12, %esp
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 16
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX-NEXT:    vmovlps %xmm0, (%esp)
-; X86-AVX-NEXT:    fldl (%esp)
-; X86-AVX-NEXT:    addl $12, %esp
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
-; X86-AVX-NEXT:    retl
-;
-; X86-NOSSE-LABEL: load_double:
-; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    subl $20, %esp
-; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 24
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    fildll (%eax)
-; X86-NOSSE-NEXT:    fistpll {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    movl %eax, (%esp)
-; X86-NOSSE-NEXT:    fldl (%esp)
-; X86-NOSSE-NEXT:    addl $20, %esp
-; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 4
-; X86-NOSSE-NEXT:    retl
+; X86-LABEL: load_double:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    fldl (%eax)
+; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: load_double:
 ; X64-SSE:       # %bb.0:
@@ -515,85 +415,44 @@
 }
 
 define fp128 @load_fp128(fp128* %fptr) {
-; X86-SSE-LABEL: load_fp128:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %edi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE-NEXT:    subl $20, %esp
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 32
-; X86-SSE-NEXT:    .cfi_offset %esi, -12
-; X86-SSE-NEXT:    .cfi_offset %edi, -8
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE-NEXT:    subl $8, %esp
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 8
-; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    pushl $0
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl $0
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl $0
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl $0
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl $0
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl $0
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl $0
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl $0
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-SSE-NEXT:    calll __sync_val_compare_and_swap_16
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-SSE-NEXT:    addl $44, %esp
-; X86-SSE-NEXT:    .cfi_adjust_cfa_offset -44
-; X86-SSE-NEXT:    movl (%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE-NEXT:    movl %edi, 8(%esi)
-; X86-SSE-NEXT:    movl %edx, 12(%esi)
-; X86-SSE-NEXT:    movl %eax, (%esi)
-; X86-SSE-NEXT:    movl %ecx, 4(%esi)
-; X86-SSE-NEXT:    movl %esi, %eax
-; X86-SSE-NEXT:    addl $20, %esp
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 12
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
-; X86-SSE-NEXT:    popl %edi
-; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
-; X86-SSE-NEXT:    retl $4
+; X86-SSE1-LABEL: load_fp128:
+; X86-SSE1:       # %bb.0:
+; X86-SSE1-NEXT:    pushl %edi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    pushl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT:    .cfi_offset %esi, -12
+; X86-SSE1-NEXT:    .cfi_offset %edi, -8
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT:    movl (%ecx), %edx
+; X86-SSE1-NEXT:    movl 4(%ecx), %esi
+; X86-SSE1-NEXT:    movl 8(%ecx), %edi
+; X86-SSE1-NEXT:    movl 12(%ecx), %ecx
+; X86-SSE1-NEXT:    movl %ecx, 12(%eax)
+; X86-SSE1-NEXT:    movl %edi, 8(%eax)
+; X86-SSE1-NEXT:    movl %esi, 4(%eax)
+; X86-SSE1-NEXT:    movl %edx, (%eax)
+; X86-SSE1-NEXT:    popl %esi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT:    popl %edi
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT:    retl $4
+;
+; X86-SSE2-LABEL: load_fp128:
+; X86-SSE2:       # %bb.0:
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    movaps (%ecx), %xmm0
+; X86-SSE2-NEXT:    movaps %xmm0, (%eax)
+; X86-SSE2-NEXT:    retl $4
 ;
 ; X86-AVX-LABEL: load_fp128:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    subl $56, %esp
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 64
-; X86-AVX-NEXT:    .cfi_offset %esi, -8
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl %eax, (%esp)
-; X86-AVX-NEXT:    vzeroupper
-; X86-AVX-NEXT:    calll __sync_val_compare_and_swap_16
-; X86-AVX-NEXT:    subl $4, %esp
-; X86-AVX-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
-; X86-AVX-NEXT:    vmovaps %xmm0, (%esi)
-; X86-AVX-NEXT:    movl %esi, %eax
-; X86-AVX-NEXT:    addl $56, %esp
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX-NEXT:    vmovaps (%ecx), %xmm0
+; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
 ; X86-AVX-NEXT:    retl $4
 ;
 ; X86-NOSSE-LABEL: load_fp128:
@@ -602,49 +461,18 @@
 ; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NOSSE-NEXT:    pushl %esi
 ; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 12
-; X86-NOSSE-NEXT:    subl $20, %esp
-; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 32
 ; X86-NOSSE-NEXT:    .cfi_offset %esi, -12
 ; X86-NOSSE-NEXT:    .cfi_offset %edi, -8
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    subl $8, %esp
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 8
-; X86-NOSSE-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    pushl $0
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl $0
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl $0
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl $0
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl $0
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl $0
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl $0
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl $0
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset 4
-; X86-NOSSE-NEXT:    calll __sync_val_compare_and_swap_16
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset -4
-; X86-NOSSE-NEXT:    addl $44, %esp
-; X86-NOSSE-NEXT:    .cfi_adjust_cfa_offset -44
-; X86-NOSSE-NEXT:    movl (%esp), %eax
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOSSE-NEXT:    movl %edi, 8(%esi)
-; X86-NOSSE-NEXT:    movl %edx, 12(%esi)
-; X86-NOSSE-NEXT:    movl %eax, (%esi)
-; X86-NOSSE-NEXT:    movl %ecx, 4(%esi)
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    addl $20, %esp
-; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 12
+; X86-NOSSE-NEXT:    movl (%ecx), %edx
+; X86-NOSSE-NEXT:    movl 4(%ecx), %esi
+; X86-NOSSE-NEXT:    movl 8(%ecx), %edi
+; X86-NOSSE-NEXT:    movl 12(%ecx), %ecx
+; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %edi, 8(%eax)
+; X86-NOSSE-NEXT:    movl %esi, 4(%eax)
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NOSSE-NEXT:    popl %edi
@@ -653,34 +481,12 @@
 ;
 ; X64-SSE-LABEL: load_fp128:
 ; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    subq $24, %rsp
-; X64-SSE-NEXT:    .cfi_def_cfa_offset 32
-; X64-SSE-NEXT:    xorl %esi, %esi
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    xorl %ecx, %ecx
-; X64-SSE-NEXT:    xorl %r8d, %r8d
-; X64-SSE-NEXT:    callq __sync_val_compare_and_swap_16
-; X64-SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; X64-SSE-NEXT:    movq %rax, (%rsp)
-; X64-SSE-NEXT:    movaps (%rsp), %xmm0
-; X64-SSE-NEXT:    addq $24, %rsp
-; X64-SSE-NEXT:    .cfi_def_cfa_offset 8
+; X64-SSE-NEXT:    movaps (%rdi), %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: load_fp128:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    subq $24, %rsp
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 32
-; X64-AVX-NEXT:    xorl %esi, %esi
-; X64-AVX-NEXT:    xorl %edx, %edx
-; X64-AVX-NEXT:    xorl %ecx, %ecx
-; X64-AVX-NEXT:    xorl %r8d, %r8d
-; X64-AVX-NEXT:    callq __sync_val_compare_and_swap_16
-; X64-AVX-NEXT:    movq %rdx, {{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rax, (%rsp)
-; X64-AVX-NEXT:    vmovaps (%rsp), %xmm0
-; X64-AVX-NEXT:    addq $24, %rsp
-; X64-AVX-NEXT:    .cfi_def_cfa_offset 8
+; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
 ; X64-AVX-NEXT:    retq
   %v = load atomic fp128, fp128* %fptr unordered, align 16
   ret fp128 %v
Index: llvm/test/CodeGen/X86/atomic-unordered.ll
===================================================================
--- llvm/test/CodeGen/X86/atomic-unordered.ll
+++ llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O0,CHECK-O0-CUR %s
-; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake | FileCheck --check-prefixes=CHECK,CHECK-O3,CHECK-O3-CUR %s
-; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel | FileCheck --check-prefixes=CHECK,CHECK-O0,CHECK-O0-EX %s
-; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel | FileCheck --check-prefixes=CHECK,CHECK-O3,CHECK-O3-EX %s
+; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=0 | FileCheck --check-prefixes=CHECK,CHECK-O0,CHECK-O0-CUR %s
+; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=0 | FileCheck --check-prefixes=CHECK,CHECK-O3,CHECK-O3-CUR %s
+; RUN: llc -O0 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=1 | FileCheck --check-prefixes=CHECK,CHECK-O0,CHECK-O0-EX %s
+; RUN: llc -O3 < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mcpu=skylake -x86-experimental-unordered-atomic-isel=1 | FileCheck --check-prefixes=CHECK,CHECK-O3,CHECK-O3-EX %s
 
 define i8 @load_i8(i8* %ptr) {
 ; CHECK-LABEL: load_i8:
Index: llvm/test/CodeGen/X86/atomic128.ll
===================================================================
--- llvm/test/CodeGen/X86/atomic128.ll
+++ llvm/test/CodeGen/X86/atomic128.ll
@@ -893,25 +893,25 @@
 ;
 ; CHECK32-LABEL: atomic_store_relaxed:
 ; CHECK32:       # %bb.0:
-; CHECK32-NEXT:    subl $36, %esp
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset 36
-; CHECK32-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
-; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
-; CHECK32-NEXT:    pushl %eax
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset 4
-; CHECK32-NEXT:    calll __sync_lock_test_and_set_16
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset -4
-; CHECK32-NEXT:    addl $56, %esp
-; CHECK32-NEXT:    .cfi_adjust_cfa_offset -56
+; CHECK32-NEXT:    pushl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 12
+; CHECK32-NEXT:    .cfi_offset %esi, -12
+; CHECK32-NEXT:    .cfi_offset %edi, -8
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK32-NEXT:    movl %esi, 12(%edi)
+; CHECK32-NEXT:    movl %edx, 8(%edi)
+; CHECK32-NEXT:    movl %ecx, 4(%edi)
+; CHECK32-NEXT:    movl %eax, (%edi)
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 8
+; CHECK32-NEXT:    popl %edi
+; CHECK32-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK32-NEXT:    retl
    store atomic i128 %in, i128* %p unordered, align 16
    ret void
Index: llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
===================================================================
--- llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
+++ llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
@@ -19,11 +19,11 @@
 ; CHECK-NEXT:    callq newarray
 ; CHECK-NEXT:  .Ltmp0:
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    addss (%rax), %xmm0
 ; CHECK-NEXT:    movdqu (%rax), %xmm1
 ; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
 ; CHECK-NEXT:    psubd %xmm2, %xmm1
 ; CHECK-NEXT:    movdqu %xmm1, (%rax)
+; CHECK-NEXT:    addss {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    movss %xmm0, (%rax)
 bci_0:
    %token418 = call token (i64, i32, i8 * (i64, i32, i32, i32)*, i32,