Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -903,8 +903,10 @@
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 
     // We want to legalize this to an f64 load rather than an i64 load on
-    // 64-bit targets and two 32-bit loads on a 32-bit target.
+    // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
+    // store.
     setOperationAction(ISD::LOAD,               MVT::v2f32, Custom);
+    setOperationAction(ISD::STORE,              MVT::v2f32, Custom);
 
     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
@@ -19862,18 +19864,36 @@
   SDValue StoredVal = St->getValue();
 
   // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
-  assert(StoredVal.getValueType().isVector() &&
-         StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
-         StoredVal.getValueType().getVectorNumElements() <= 8 &&
-         "Unexpected VT");
-  assert(!St->isTruncatingStore() && "Expected non-truncating store");
-  assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
-         "Expected AVX512F without AVX512DQI");
-
-  StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
-                          DAG.getUNDEF(MVT::v8i1), StoredVal,
+  if (StoredVal.getValueType().isVector() &&
+            StoredVal.getValueType().getVectorElementType() == MVT::i1) {
+    assert(StoredVal.getValueType().getVectorNumElements() <= 8 &&
+           "Unexpected VT");
+    assert(!St->isTruncatingStore() && "Expected non-truncating store");
+    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+           "Expected AVX512F without AVX512DQI");
+
+    StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                            DAG.getUNDEF(MVT::v8i1), StoredVal,
+                            DAG.getIntPtrConstant(0, dl));
+    StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
+
+    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+
+  if (St->isTruncatingStore())
+    return SDValue();
+
+  assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT");
+
+  // Widen the vector, cast to a v2x64 type, extract the single 64-bit
+  // element and store it.
+  StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal,
+                          DAG.getUNDEF(MVT::v2f32));
+  StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal);
+  StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal,
                           DAG.getIntPtrConstant(0, dl));
-  StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
 
   return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
                       St->getPointerInfo(), St->getAlignment(),
@@ -36864,7 +36884,8 @@
     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
     // pair instead.
     if (Subtarget.is64Bit() || F64IsLegal) {
-      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
+      MVT LdVT = (Subtarget.is64Bit() &&
+                  (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64;
       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                   Ld->getMemOperand());
 
Index: test/CodeGen/X86/2011-10-19-widen_vselect.ll
===================================================================
--- test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -8,8 +8,7 @@
 define void @simple_widen(<2 x float> %a, <2 x float> %b) {
 ; X32-LABEL: simple_widen:
 ; X32:       # %bb.0: # %entry
-; X32-NEXT:    extractps $1, %xmm1, (%eax)
-; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    movlps %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: simple_widen:
@@ -28,8 +27,7 @@
 ; X32-NEXT:    movaps %xmm0, %xmm2
 ; X32-NEXT:    cmpordps %xmm0, %xmm0
 ; X32-NEXT:    blendvps %xmm0, %xmm2, %xmm1
-; X32-NEXT:    extractps $1, %xmm1, (%eax)
-; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    movlps %xmm1, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: complex_inreg_work:
@@ -50,8 +48,7 @@
 ; X32-LABEL: zero_test:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    extractps $1, %xmm0, (%eax)
-; X32-NEXT:    movss %xmm0, (%eax)
+; X32-NEXT:    movlps %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: zero_test:
@@ -82,11 +79,8 @@
 ; X32-NEXT:    cmpeqps %xmm2, %xmm1
 ; X32-NEXT:    movaps %xmm1, %xmm0
 ; X32-NEXT:    blendvps %xmm0, %xmm2, %xmm4
-; X32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
-; X32-NEXT:    movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; X32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
-; X32-NEXT:    movss %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    movlps %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movlps %xmm4, {{[0-9]+}}(%esp)
 ; X32-NEXT:    addl $60, %esp
 ; X32-NEXT:    .cfi_def_cfa_offset 4
 ; X32-NEXT:    retl
Index: test/CodeGen/X86/2012-01-18-vbitcast.ll
===================================================================
--- test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -7,6 +7,10 @@
 ; CHECK-NEXT:    subq $16, %rsp
 ; CHECK-NEXT:    .seh_stackalloc 16
 ; CHECK-NEXT:    .seh_endprologue
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd %xmm0, (%rsp)
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    psubq %xmm1, %xmm0
Index: test/CodeGen/X86/sse-schedule.ll
===================================================================
--- test/CodeGen/X86/sse-schedule.ll
+++ test/CodeGen/X86/sse-schedule.ll
@@ -2712,8 +2712,7 @@
 ; GENERIC:       # %bb.0:
 ; GENERIC-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; GENERIC-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; GENERIC-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; GENERIC-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; GENERIC-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2723,16 +2722,14 @@
 ; ATOM-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
 ; ATOM-NEXT:    addps %xmm1, %xmm2 # sched: [5:5.00]
 ; ATOM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
-; ATOM-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1] sched: [1:1.00]
-; ATOM-NEXT:    movlps %xmm2, (%rdi) # sched: [1:1.00]
+; ATOM-NEXT:    movhps %xmm2, (%rdi) # sched: [1:1.00]
 ; ATOM-NEXT:    retq # sched: [79:39.50]
 ;
 ; SLM-LABEL: test_movhps:
 ; SLM:       # %bb.0:
 ; SLM-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
 ; SLM-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; SLM-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SLM-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SLM-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2740,8 +2737,7 @@
 ; SANDY-SSE:       # %bb.0:
 ; SANDY-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; SANDY-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SANDY-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SANDY-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SANDY-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; SANDY-SSE-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2749,7 +2745,7 @@
 ; SANDY:       # %bb.0:
 ; SANDY-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
 ; SANDY-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SANDY-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00]
+; SANDY-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SANDY-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; SANDY-NEXT:    retq # sched: [1:1.00]
 ;
@@ -2757,8 +2753,7 @@
 ; HASWELL-SSE:       # %bb.0:
 ; HASWELL-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; HASWELL-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; HASWELL-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; HASWELL-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; HASWELL-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; HASWELL-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; HASWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2766,7 +2761,7 @@
 ; HASWELL:       # %bb.0:
 ; HASWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; HASWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; HASWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; HASWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; HASWELL-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; HASWELL-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2774,8 +2769,7 @@
 ; BROADWELL-SSE:       # %bb.0:
 ; BROADWELL-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BROADWELL-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BROADWELL-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; BROADWELL-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; BROADWELL-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; BROADWELL-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
 ; BROADWELL-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2783,7 +2777,7 @@
 ; BROADWELL:       # %bb.0:
 ; BROADWELL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BROADWELL-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BROADWELL-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; BROADWELL-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; BROADWELL-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
 ; BROADWELL-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2791,8 +2785,7 @@
 ; SKYLAKE-SSE:       # %bb.0:
 ; SKYLAKE-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKYLAKE-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [4:0.50]
-; SKYLAKE-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SKYLAKE-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SKYLAKE-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2800,7 +2793,7 @@
 ; SKYLAKE:       # %bb.0:
 ; SKYLAKE-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKYLAKE-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKYLAKE-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKYLAKE-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SKYLAKE-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKYLAKE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2808,8 +2801,7 @@
 ; SKX-SSE:       # %bb.0:
 ; SKX-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKX-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00]
-; SKX-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:1.00]
+; SKX-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-SSE-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2817,7 +2809,7 @@
 ; SKX:       # %bb.0:
 ; SKX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    retq # sched: [7:1.00]
 ;
@@ -2825,8 +2817,7 @@
 ; BTVER2-SSE:       # %bb.0:
 ; BTVER2-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BTVER2-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50]
-; BTVER2-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [2:1.00]
+; BTVER2-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [2:1.00]
 ; BTVER2-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2834,7 +2825,7 @@
 ; BTVER2:       # %bb.0:
 ; BTVER2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
 ; BTVER2-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [3:1.00]
+; BTVER2-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [2:1.00]
 ; BTVER2-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.50]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -2842,8 +2833,7 @@
 ; ZNVER1-SSE:       # %bb.0:
 ; ZNVER1-SSE-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
 ; ZNVER1-SSE-NEXT:    addps %xmm1, %xmm0 # sched: [3:1.00]
-; ZNVER1-SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50]
-; ZNVER1-SSE-NEXT:    movlps %xmm0, (%rdi) # sched: [1:0.50]
+; ZNVER1-SSE-NEXT:    movhps %xmm0, (%rdi) # sched: [1:0.50]
 ; ZNVER1-SSE-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.25]
 ; ZNVER1-SSE-NEXT:    retq # sched: [1:0.50]
 ;
@@ -2851,7 +2841,7 @@
 ; ZNVER1:       # %bb.0:
 ; ZNVER1-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
 ; ZNVER1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; ZNVER1-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [5:3.00]
+; ZNVER1-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:0.50]
 ; ZNVER1-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = bitcast x86_mmx* %a2 to <2 x float>*
Index: test/CodeGen/X86/vec_fptrunc.ll
===================================================================
--- test/CodeGen/X86/vec_fptrunc.ll
+++ test/CodeGen/X86/vec_fptrunc.ll
@@ -10,8 +10,7 @@
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-SSE-NEXT:    cvtpd2ps (%ecx), %xmm0
-; X32-SSE-NEXT:    extractps $1, %xmm0, 4(%eax)
-; X32-SSE-NEXT:    movss %xmm0, (%eax)
+; X32-SSE-NEXT:    movlpd %xmm0, (%eax)
 ; X32-SSE-NEXT:    retl
 ;
 ; X32-AVX-LABEL: fptrunc_frommem2:
@@ -19,8 +18,7 @@
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-AVX-NEXT:    vcvtpd2psx (%ecx), %xmm0
-; X32-AVX-NEXT:    vextractps $1, %xmm0, 4(%eax)
-; X32-AVX-NEXT:    vmovss %xmm0, (%eax)
+; X32-AVX-NEXT:    vmovlpd %xmm0, (%eax)
 ; X32-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: fptrunc_frommem2:
Index: test/CodeGen/X86/widen_conv-3.ll
===================================================================
--- test/CodeGen/X86/widen_conv-3.ll
+++ test/CodeGen/X86/widen_conv-3.ll
@@ -7,28 +7,15 @@
 ; sign to float v2i16 to v2f32
 
 define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind {
-; X86-SSE2-LABEL: convert_v2i16_to_v2f32:
-; X86-SSE2:       # %bb.0: # %entry
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    psllq $48, %xmm0
-; X86-SSE2-NEXT:    psrad $16, %xmm0
-; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-SSE2-NEXT:    movss %xmm0, (%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X86-SSE2-NEXT:    movss %xmm0, 4(%eax)
-; X86-SSE2-NEXT:    retl
-;
-; X86-SSE42-LABEL: convert_v2i16_to_v2f32:
-; X86-SSE42:       # %bb.0: # %entry
-; X86-SSE42-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE42-NEXT:    psllq $48, %xmm0
-; X86-SSE42-NEXT:    psrad $16, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X86-SSE42-NEXT:    cvtdq2ps %xmm0, %xmm0
-; X86-SSE42-NEXT:    extractps $1, %xmm0, 4(%eax)
-; X86-SSE42-NEXT:    movss %xmm0, (%eax)
-; X86-SSE42-NEXT:    retl
+; X86-LABEL: convert_v2i16_to_v2f32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    psllq $48, %xmm0
+; X86-NEXT:    psrad $16, %xmm0
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-NEXT:    movlps %xmm0, (%eax)
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: convert_v2i16_to_v2f32:
 ; X64:       # %bb.0: # %entry