Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -277,7 +277,8 @@
       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
     } else {
-      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
+      if (UseX87)
+        setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
     }
   } else if (!Subtarget.useSoftFloat()) {
@@ -547,11 +548,13 @@
     // cases we handle.
     addLegalFPImmediate(APFloat(+0.0)); // xorpd
     addLegalFPImmediate(APFloat(+0.0f)); // xorps
-  } else if (UseX87 && X86ScalarSSEf32) {
+  } else if (X86ScalarSSEf32 &&
+             (UseX87 || (!useSoftFloat() && Subtarget.is64Bit()))) {
     // Use SSE for f32, x87 for f64.
     // Set up the FP register classes.
     addRegisterClass(MVT::f32, &X86::FR32RegClass);
-    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
+    if (UseX87)
+      addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 
     // Use ANDPS to simulate FABS.
     setOperationAction(ISD::FABS , MVT::f32, Custom);
@@ -559,10 +562,12 @@
     // Use XORP to simulate FNEG.
     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 
-    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+    if (UseX87)
+      setOperationAction(ISD::UNDEF, MVT::f64, Expand);
 
     // Use ANDPS and ORPS to simulate FCOPYSIGN.
-    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    if (UseX87)
+      setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 
     // We don't support sin/cos/fmod
@@ -572,15 +577,17 @@
 
     // Special cases we handle for FP constants.
     addLegalFPImmediate(APFloat(+0.0f)); // xorps
-    addLegalFPImmediate(APFloat(+0.0)); // FLD0
-    addLegalFPImmediate(APFloat(+1.0)); // FLD1
-    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
-    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+    if (UseX87) {
+      addLegalFPImmediate(APFloat(+0.0)); // FLD0
+      addLegalFPImmediate(APFloat(+1.0)); // FLD1
+      addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+      addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 
-    // Always expand sin/cos functions even though x87 has an instruction.
-    setOperationAction(ISD::FSIN   , MVT::f64, Expand);
-    setOperationAction(ISD::FCOS   , MVT::f64, Expand);
-    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+      // Always expand sin/cos functions even though x87 has an instruction.
+      setOperationAction(ISD::FSIN, MVT::f64, Expand);
+      setOperationAction(ISD::FCOS, MVT::f64, Expand);
+      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    }
   } else if (UseX87) {
     // f32 and f64 in x87.
     // Set up the FP register classes.
@@ -1936,7 +1943,8 @@
       if (Subtarget.hasSSE2())
         return MVT::v16i8;
       // TODO: Can SSE1 handle a byte vector?
-      if (Subtarget.hasSSE1())
+      // If we have SSE1 registers we should be able to use them.
+      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
         return MVT::v4f32;
     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
Index: llvm/test/CodeGen/X86/pr38738.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/pr38738.ll
@@ -0,0 +1,254 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - -mattr=-x87,+sse,-sse2 %s  | FileCheck --check-prefixes=X64SSE %s
+; RUN: llc -mtriple=i686-unknown-linux-gnu -o - -mattr=-x87,+sse,-sse2 %s    | FileCheck --check-prefixes=X86SSE %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - -mattr=-x87,+sse2,-sse3 %s | FileCheck --check-prefixes=X64SSE2 %s
+; RUN: llc -mtriple=i686-unknown-linux-gnu -o - -mattr=-x87,+sse2,-sse3 %s   | FileCheck --check-prefixes=X86SSE2 %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -o - -mattr=-x87,+avx,-avx2 %s  | FileCheck --check-prefixes=X64AVX %s
+; RUN: llc -mtriple=i686-unknown-linux-gnu -o - -mattr=-x87,+avx,-avx2 %s    | FileCheck --check-prefixes=X86AVX %s
+
+
+%struct.params = type { double, double }
+
+define dso_local i32 @pr38738() {
+; X64SSE-LABEL: pr38738:
+; X64SSE:       # %bb.0: # %entry
+; X64SSE-NEXT:    xorps %xmm0, %xmm0
+; X64SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64SSE-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
+; X64SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64SSE-NEXT:    retq
+;
+; X86SSE-LABEL: pr38738:
+; X86SSE:       # %bb.0: # %entry
+; X86SSE-NEXT:    subl $28, %esp
+; X86SSE-NEXT:    .cfi_def_cfa_offset 32
+; X86SSE-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86SSE-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86SSE-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86SSE-NEXT:    movl $0, (%esp)
+; X86SSE-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86SSE-NEXT:    addl $28, %esp
+; X86SSE-NEXT:    .cfi_def_cfa_offset 4
+; X86SSE-NEXT:    retl
+;
+; X64SSE2-LABEL: pr38738:
+; X64SSE2:       # %bb.0: # %entry
+; X64SSE2-NEXT:    xorps %xmm0, %xmm0
+; X64SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64SSE2-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
+; X64SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64SSE2-NEXT:    retq
+;
+; X86SSE2-LABEL: pr38738:
+; X86SSE2:       # %bb.0: # %entry
+; X86SSE2-NEXT:    subl $44, %esp
+; X86SSE2-NEXT:    .cfi_def_cfa_offset 48
+; X86SSE2-NEXT:    xorps %xmm0, %xmm0
+; X86SSE2-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86SSE2-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86SSE2-NEXT:    addl $44, %esp
+; X86SSE2-NEXT:    .cfi_def_cfa_offset 4
+; X86SSE2-NEXT:    retl
+;
+; X64AVX-LABEL: pr38738:
+; X64AVX:       # %bb.0: # %entry
+; X64AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X64AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64AVX-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
+; X64AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; X64AVX-NEXT:    retq
+;
+; X86AVX-LABEL: pr38738:
+; X86AVX:       # %bb.0: # %entry
+; X86AVX-NEXT:    subl $44, %esp
+; X86AVX-NEXT:    .cfi_def_cfa_offset 48
+; X86AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86AVX-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86AVX-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86AVX-NEXT:    addl $44, %esp
+; X86AVX-NEXT:    .cfi_def_cfa_offset 4
+; X86AVX-NEXT:    retl
+entry:
+  %retval = alloca i32, align 4
+  %dlg_sys_param = alloca %struct.params, align 8
+  %total_active_bw = alloca float, align 4
+  %0 = bitcast %struct.params* %dlg_sys_param to i8*
+  call void @llvm.memset.p0i8.i64(i8* align 8 %0, i8 0, i64 16, i1 false)
+  store float 0.000000e+00, float* %total_active_bw, align 4
+  %1 = load i32, i32* %retval, align 4
+  ret i32 %1
+}
+
+define dso_local void @tryset(i8* nocapture %x) local_unnamed_addr {
+; X64SSE-LABEL: tryset:
+; X64SSE:       # %bb.0:
+; X64SSE-NEXT:    movq $0, 56(%rdi)
+; X64SSE-NEXT:    movq $0, 48(%rdi)
+; X64SSE-NEXT:    movq $0, 40(%rdi)
+; X64SSE-NEXT:    movq $0, 32(%rdi)
+; X64SSE-NEXT:    movq $0, 24(%rdi)
+; X64SSE-NEXT:    movq $0, 16(%rdi)
+; X64SSE-NEXT:    movq $0, 8(%rdi)
+; X64SSE-NEXT:    movq $0, (%rdi)
+; X64SSE-NEXT:    retq
+;
+; X86SSE-LABEL: tryset:
+; X86SSE:       # %bb.0:
+; X86SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86SSE-NEXT:    movl $0, 60(%eax)
+; X86SSE-NEXT:    movl $0, 56(%eax)
+; X86SSE-NEXT:    movl $0, 52(%eax)
+; X86SSE-NEXT:    movl $0, 48(%eax)
+; X86SSE-NEXT:    movl $0, 44(%eax)
+; X86SSE-NEXT:    movl $0, 40(%eax)
+; X86SSE-NEXT:    movl $0, 36(%eax)
+; X86SSE-NEXT:    movl $0, 32(%eax)
+; X86SSE-NEXT:    movl $0, 28(%eax)
+; X86SSE-NEXT:    movl $0, 24(%eax)
+; X86SSE-NEXT:    movl $0, 20(%eax)
+; X86SSE-NEXT:    movl $0, 16(%eax)
+; X86SSE-NEXT:    movl $0, 12(%eax)
+; X86SSE-NEXT:    movl $0, 8(%eax)
+; X86SSE-NEXT:    movl $0, 4(%eax)
+; X86SSE-NEXT:    movl $0, (%eax)
+; X86SSE-NEXT:    retl
+;
+; X64SSE2-LABEL: tryset:
+; X64SSE2:       # %bb.0:
+; X64SSE2-NEXT:    movq $0, 56(%rdi)
+; X64SSE2-NEXT:    movq $0, 48(%rdi)
+; X64SSE2-NEXT:    movq $0, 40(%rdi)
+; X64SSE2-NEXT:    movq $0, 32(%rdi)
+; X64SSE2-NEXT:    movq $0, 24(%rdi)
+; X64SSE2-NEXT:    movq $0, 16(%rdi)
+; X64SSE2-NEXT:    movq $0, 8(%rdi)
+; X64SSE2-NEXT:    movq $0, (%rdi)
+; X64SSE2-NEXT:    retq
+;
+; X86SSE2-LABEL: tryset:
+; X86SSE2:       # %bb.0:
+; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86SSE2-NEXT:    movl $0, 4(%eax)
+; X86SSE2-NEXT:    movl $0, (%eax)
+; X86SSE2-NEXT:    movl $0, 12(%eax)
+; X86SSE2-NEXT:    movl $0, 8(%eax)
+; X86SSE2-NEXT:    movl $0, 20(%eax)
+; X86SSE2-NEXT:    movl $0, 16(%eax)
+; X86SSE2-NEXT:    movl $0, 28(%eax)
+; X86SSE2-NEXT:    movl $0, 24(%eax)
+; X86SSE2-NEXT:    movl $0, 36(%eax)
+; X86SSE2-NEXT:    movl $0, 32(%eax)
+; X86SSE2-NEXT:    movl $0, 44(%eax)
+; X86SSE2-NEXT:    movl $0, 40(%eax)
+; X86SSE2-NEXT:    movl $0, 52(%eax)
+; X86SSE2-NEXT:    movl $0, 48(%eax)
+; X86SSE2-NEXT:    movl $0, 60(%eax)
+; X86SSE2-NEXT:    movl $0, 56(%eax)
+; X86SSE2-NEXT:    retl
+;
+; X64AVX-LABEL: tryset:
+; X64AVX:       # %bb.0:
+; X64AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X64AVX-NEXT:    vmovups %ymm0, 32(%rdi)
+; X64AVX-NEXT:    vmovups %ymm0, (%rdi)
+; X64AVX-NEXT:    vzeroupper
+; X64AVX-NEXT:    retq
+;
+; X86AVX-LABEL: tryset:
+; X86AVX:       # %bb.0:
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; X86AVX-NEXT:    vmovups %ymm0, 32(%eax)
+; X86AVX-NEXT:    vmovups %ymm0, (%eax)
+; X86AVX-NEXT:    vzeroupper
+; X86AVX-NEXT:    retl
+  tail call void @llvm.memset.p0i8.i64(i8* align 1 %x, i8 0, i64 64, i1 false)
+  ret void
+}
+
+define dso_local void @trycpy(i8* nocapture %x, i8* nocapture readonly %y) local_unnamed_addr {
+; X64SSE-LABEL: trycpy:
+; X64SSE:       # %bb.0:
+; X64SSE-NEXT:    movq 24(%rsi), %rax
+; X64SSE-NEXT:    movq %rax, 24(%rdi)
+; X64SSE-NEXT:    movq 16(%rsi), %rax
+; X64SSE-NEXT:    movq %rax, 16(%rdi)
+; X64SSE-NEXT:    movq (%rsi), %rax
+; X64SSE-NEXT:    movq 8(%rsi), %rcx
+; X64SSE-NEXT:    movq %rcx, 8(%rdi)
+; X64SSE-NEXT:    movq %rax, (%rdi)
+; X64SSE-NEXT:    retq
+;
+; X86SSE-LABEL: trycpy:
+; X86SSE:       # %bb.0:
+; X86SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86SSE-NEXT:    movl 28(%ecx), %edx
+; X86SSE-NEXT:    movl %edx, 28(%eax)
+; X86SSE-NEXT:    movl 24(%ecx), %edx
+; X86SSE-NEXT:    movl %edx, 24(%eax)
+; X86SSE-NEXT:    movl 20(%ecx), %edx
+; X86SSE-NEXT:    movl %edx, 20(%eax)
+; X86SSE-NEXT:    movl 16(%ecx), %edx
+; X86SSE-NEXT:    movl %edx, 16(%eax)
+; X86SSE-NEXT:    movl 12(%ecx), %edx
+; X86SSE-NEXT:    movl %edx, 12(%eax)
+; X86SSE-NEXT:    movl 8(%ecx), %edx
+; X86SSE-NEXT:    movl %edx, 8(%eax)
+; X86SSE-NEXT:    movl (%ecx), %edx
+; X86SSE-NEXT:    movl 4(%ecx), %ecx
+; X86SSE-NEXT:    movl %ecx, 4(%eax)
+; X86SSE-NEXT:    movl %edx, (%eax)
+; X86SSE-NEXT:    retl
+;
+; X64SSE2-LABEL: trycpy:
+; X64SSE2:       # %bb.0:
+; X64SSE2-NEXT:    movq 24(%rsi), %rax
+; X64SSE2-NEXT:    movq %rax, 24(%rdi)
+; X64SSE2-NEXT:    movq 16(%rsi), %rax
+; X64SSE2-NEXT:    movq %rax, 16(%rdi)
+; X64SSE2-NEXT:    movq (%rsi), %rax
+; X64SSE2-NEXT:    movq 8(%rsi), %rcx
+; X64SSE2-NEXT:    movq %rcx, 8(%rdi)
+; X64SSE2-NEXT:    movq %rax, (%rdi)
+; X64SSE2-NEXT:    retq
+;
+; X86SSE2-LABEL: trycpy:
+; X86SSE2:       # %bb.0:
+; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86SSE2-NEXT:    movsd %xmm0, 24(%eax)
+; X86SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86SSE2-NEXT:    movsd %xmm0, 16(%eax)
+; X86SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; X86SSE2-NEXT:    movsd %xmm1, 8(%eax)
+; X86SSE2-NEXT:    movsd %xmm0, (%eax)
+; X86SSE2-NEXT:    retl
+;
+; X64AVX-LABEL: trycpy:
+; X64AVX:       # %bb.0:
+; X64AVX-NEXT:    vmovups (%rsi), %ymm0
+; X64AVX-NEXT:    vmovups %ymm0, (%rdi)
+; X64AVX-NEXT:    vzeroupper
+; X64AVX-NEXT:    retq
+;
+; X86AVX-LABEL: trycpy:
+; X86AVX:       # %bb.0:
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86AVX-NEXT:    vmovups (%ecx), %ymm0
+; X86AVX-NEXT:    vmovups %ymm0, (%eax)
+; X86AVX-NEXT:    vzeroupper
+; X86AVX-NEXT:    retl
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %x, i8* align 1 %y, i64 32, i1 false)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #2
+
Index: llvm/test/CodeGen/X86/x87.ll
===================================================================
--- llvm/test/CodeGen/X86/x87.ll
+++ llvm/test/CodeGen/X86/x87.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -mtriple=i686-- | FileCheck %s -check-prefix=X87
-; RUN: llc < %s -mtriple=x86_64-- -mattr=-sse | FileCheck %s -check-prefix=X87
-; RUN: llc < %s -mtriple=i686-- -mattr=-x87 | FileCheck %s -check-prefix=NOX87
-; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87
-; RUN: llc < %s -mtriple=i686-- -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87
-; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s -check-prefixes=X8732,X87
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-sse | FileCheck %s -check-prefixes=X8732,X87
+; RUN: llc < %s -mtriple=i686-- -mattr=-x87 | FileCheck %s -check-prefixes=NOX8732,NOX87
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,-sse | FileCheck %s -check-prefixes=NOX8732,NOX87
+; RUN: llc < %s -mtriple=i686-- -mattr=-x87,+sse | FileCheck %s -check-prefixes=NOX8732,NOX87
+; RUN: llc < %s -mtriple=x86_64-- -mattr=-x87,-sse2 | FileCheck %s -check-prefixes=X8732_SSE,NOX87
 
 define void @test(i32 %i, i64 %l, float* %pf, double* %pd, fp128* %pld) nounwind readnone {
 ; X87-LABEL: test:
@@ -12,18 +12,18 @@
 ; NOX87-NOT: {{ }}f{{.*}}
 
 ; X87: fild
-; NOX87: __floatunsisf
+; NOX8732: __floatunsisf
   %tmp = uitofp i32 %i to float
 
-; X87: fild
-; NOX87: __floatdisf
+; X8732: fild
+; NOX8732: __floatdisf
   %tmp1 = sitofp i64 %l to float
 
-; X87: fadd
-; NOX87: __addsf3
+; X8732: fadd
+; NOX8732: __addsf3
   %tmp2 = fadd float %tmp, %tmp1
 
-; X87: fstp
+; X8732: fstp
   store float %tmp2, float* %pf
 
 ; X87: fild