Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -902,8 +902,10 @@ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); // We want to legalize this to an f64 load rather than an i64 load on - // 64-bit targets and two 32-bit loads on a 32-bit target. + // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for + // store. setOperationAction(ISD::LOAD, MVT::v2f32, Custom); + setOperationAction(ISD::STORE, MVT::v2f32, Custom); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); @@ -19943,18 +19945,36 @@ SDValue StoredVal = St->getValue(); // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. - assert(StoredVal.getValueType().isVector() && - StoredVal.getValueType().getVectorElementType() == MVT::i1 && - StoredVal.getValueType().getVectorNumElements() <= 8 && - "Unexpected VT"); - assert(!St->isTruncatingStore() && "Expected non-truncating store"); - assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && - "Expected AVX512F without AVX512DQI"); + if (StoredVal.getValueType().isVector() && + StoredVal.getValueType().getVectorElementType() == MVT::i1) { + assert(StoredVal.getValueType().getVectorNumElements() <= 8 && + "Unexpected VT"); + assert(!St->isTruncatingStore() && "Expected non-truncating store"); + assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && + "Expected AVX512F without AVX512DQI"); + + StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, + DAG.getUNDEF(MVT::v8i1), StoredVal, + DAG.getIntPtrConstant(0, dl)); + StoredVal = DAG.getBitcast(MVT::i8, StoredVal); + + return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), + St->getPointerInfo(), St->getAlignment(), + St->getMemOperand()->getFlags()); + } + + if (St->isTruncatingStore()) + return SDValue(); + + assert(StoredVal.getValueType() == MVT::v2f32 && "Unexpected VT"); - StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, - DAG.getUNDEF(MVT::v8i1), StoredVal, + // Widen the vector, cast to a v2x64 type, extract the single 64-bit + // element and store it. + StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, StoredVal, + DAG.getUNDEF(MVT::v2f32)); + StoredVal = DAG.getBitcast(MVT::v2f64, StoredVal); + StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, StoredVal, DAG.getIntPtrConstant(0, dl)); - StoredVal = DAG.getBitcast(MVT::i8, StoredVal); return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), @@ -36912,7 +36932,8 @@ // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget.is64Bit() || F64IsLegal) { - MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; + MVT LdVT = (Subtarget.is64Bit() && + (!VT.isFloatingPoint() || !F64IsLegal)) ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); Index: llvm/trunk/test/CodeGen/X86/2011-10-19-widen_vselect.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ llvm/trunk/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -8,8 +8,7 @@ define void @simple_widen(<2 x float> %a, <2 x float> %b) { ; X32-LABEL: simple_widen: ; X32: # %bb.0: # %entry -; X32-NEXT: extractps $1, %xmm1, (%eax) -; X32-NEXT: movss %xmm1, (%eax) +; X32-NEXT: movlps %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: simple_widen: @@ -28,8 +27,7 @@ ; X32-NEXT: movaps %xmm0, %xmm2 ; X32-NEXT: cmpordps %xmm0, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm1 -; X32-NEXT: extractps $1, %xmm1, (%eax) -; X32-NEXT: movss %xmm1, (%eax) +; X32-NEXT: movlps %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: complex_inreg_work: @@ -50,8 +48,7 @@ ; X32-LABEL: zero_test: ; X32: # %bb.0: # %entry ; X32-NEXT: xorps %xmm0, %xmm0 -; X32-NEXT: extractps $1, %xmm0, (%eax) -; X32-NEXT: movss %xmm0, (%eax) +; X32-NEXT: movlps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: zero_test: @@ -82,11 +79,8 @@ ; X32-NEXT: cmpeqps %xmm2, %xmm1 ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm4 -; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; X32-NEXT: movshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) -; X32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; X32-NEXT: movlps %xmm4, {{[0-9]+}}(%esp) +; X32-NEXT: movlps %xmm4, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp ; X32-NEXT: .cfi_def_cfa_offset 4 ; X32-NEXT: retl Index: llvm/trunk/test/CodeGen/X86/sse-schedule.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse-schedule.ll +++ llvm/trunk/test/CodeGen/X86/sse-schedule.ll @@ -2712,8 +2712,7 @@ ; GENERIC: # %bb.0: ; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] ; GENERIC-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; GENERIC-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; GENERIC-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; GENERIC-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; GENERIC-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2723,16 +2722,14 @@ ; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] ; ATOM-NEXT: addps %xmm1, %xmm2 # sched: [5:5.00] ; ATOM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] -; ATOM-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] sched: [1:1.00] -; ATOM-NEXT: movlps %xmm2, (%rdi) # sched: [1:1.00] +; ATOM-NEXT: movhps %xmm2, (%rdi) # sched: [1:1.00] ; ATOM-NEXT: retq # sched: [79:39.50] ; ; SLM-LABEL: test_movhps: ; SLM: # %bb.0: ; SLM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] ; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; SLM-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; SLM-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; SLM-NEXT: retq # sched: [4:1.00] ; @@ -2740,8 +2737,7 @@ ; SANDY-SSE: # %bb.0: ; SANDY-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] ; SANDY-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; SANDY-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; SANDY-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; SANDY-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] ; SANDY-SSE-NEXT: retq # sched: [1:1.00] ; @@ -2749,7 +2745,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] ; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; SANDY-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:1.00] +; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; SANDY-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; SANDY-NEXT: retq # sched: [1:1.00] ; @@ -2757,8 +2753,7 @@ ; HASWELL-SSE: # %bb.0: ; HASWELL-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; HASWELL-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; HASWELL-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; HASWELL-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; HASWELL-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] ; HASWELL-SSE-NEXT: retq # sched: [7:1.00] ; @@ -2766,7 +2761,7 @@ ; HASWELL: # %bb.0: ; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; HASWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00] +; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; HASWELL-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; HASWELL-NEXT: retq # sched: [7:1.00] ; @@ -2774,8 +2769,7 @@ ; BROADWELL-SSE: # %bb.0: ; BROADWELL-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BROADWELL-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; BROADWELL-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; BROADWELL-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; BROADWELL-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; BROADWELL-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00] ; BROADWELL-SSE-NEXT: retq # sched: [7:1.00] ; @@ -2783,7 +2777,7 @@ ; BROADWELL: # %bb.0: ; BROADWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BROADWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; BROADWELL-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00] +; BROADWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; BROADWELL-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; @@ -2791,8 +2785,7 @@ ; SKYLAKE-SSE: # %bb.0: ; SKYLAKE-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; SKYLAKE-SSE-NEXT: addps %xmm1, %xmm0 # sched: [4:0.50] -; SKYLAKE-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; SKYLAKE-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; SKYLAKE-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-SSE-NEXT: retq # sched: [7:1.00] ; @@ -2800,7 +2793,7 @@ ; SKYLAKE: # %bb.0: ; SKYLAKE-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; SKYLAKE-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] -; SKYLAKE-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00] +; SKYLAKE-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; SKYLAKE-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKYLAKE-NEXT: retq # sched: [7:1.00] ; @@ -2808,8 +2801,7 @@ ; SKX-SSE: # %bb.0: ; SKX-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; SKX-SSE-NEXT: addps %xmm1, %xmm0 # sched: [4:0.50] -; SKX-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:1.00] -; SKX-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:1.00] +; SKX-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:1.00] ; SKX-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-SSE-NEXT: retq # sched: [7:1.00] ; @@ -2817,7 +2809,7 @@ ; SKX: # %bb.0: ; SKX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50] -; SKX-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00] +; SKX-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] ; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.33] ; SKX-NEXT: retq # sched: [7:1.00] ; @@ -2825,8 +2817,7 @@ ; BTVER2-SSE: # %bb.0: ; BTVER2-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; BTVER2-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50] -; BTVER2-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [2:1.00] +; BTVER2-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [2:1.00] ; BTVER2-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; @@ -2834,7 +2825,7 @@ ; BTVER2: # %bb.0: ; BTVER2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] ; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [3:1.00] +; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [2:1.00] ; BTVER2-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.50] ; BTVER2-NEXT: retq # sched: [4:1.00] ; @@ -2842,8 +2833,7 @@ ; ZNVER1-SSE: # %bb.0: ; ZNVER1-SSE-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] ; ZNVER1-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] -; ZNVER1-SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] sched: [1:0.50] -; ZNVER1-SSE-NEXT: movlps %xmm0, (%rdi) # sched: [1:0.50] +; ZNVER1-SSE-NEXT: movhps %xmm0, (%rdi) # sched: [1:0.50] ; ZNVER1-SSE-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.25] ; ZNVER1-SSE-NEXT: retq # sched: [1:0.50] ; @@ -2851,7 +2841,7 @@ ; ZNVER1: # %bb.0: ; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] ; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [5:3.00] +; ZNVER1-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:0.50] ; ZNVER1-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:0.25] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = bitcast x86_mmx* %a2 to <2 x float>* Index: llvm/trunk/test/CodeGen/X86/vec_fptrunc.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vec_fptrunc.ll +++ llvm/trunk/test/CodeGen/X86/vec_fptrunc.ll @@ -10,8 +10,7 @@ ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm0 -; X32-SSE-NEXT: extractps $1, %xmm0, 4(%eax) -; X32-SSE-NEXT: movss %xmm0, (%eax) +; X32-SSE-NEXT: movlpd %xmm0, (%eax) ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: fptrunc_frommem2: @@ -19,8 +18,7 @@ ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX-NEXT: vcvtpd2psx (%ecx), %xmm0 -; X32-AVX-NEXT: vextractps $1, %xmm0, 4(%eax) -; X32-AVX-NEXT: vmovss %xmm0, (%eax) +; X32-AVX-NEXT: vmovlpd %xmm0, (%eax) ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: fptrunc_frommem2: Index: llvm/trunk/test/CodeGen/X86/widen_conv-3.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/widen_conv-3.ll +++ llvm/trunk/test/CodeGen/X86/widen_conv-3.ll @@ -7,28 +7,15 @@ ; sign to float v2i16 to v2f32 define void @convert_v2i16_to_v2f32(<2 x float>* %dst.addr, <2 x i16> %src) nounwind { -; X86-SSE2-LABEL: convert_v2i16_to_v2f32: -; X86-SSE2: # %bb.0: # %entry -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: psllq $48, %xmm0 -; X86-SSE2-NEXT: psrad $16, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; X86-SSE2-NEXT: movss %xmm0, (%eax) -; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X86-SSE2-NEXT: movss %xmm0, 4(%eax) -; X86-SSE2-NEXT: retl -; -; X86-SSE42-LABEL: convert_v2i16_to_v2f32: -; X86-SSE42: # %bb.0: # %entry -; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE42-NEXT: psllq $48, %xmm0 -; X86-SSE42-NEXT: psrad $16, %xmm0 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 -; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) -; X86-SSE42-NEXT: movss %xmm0, (%eax) -; X86-SSE42-NEXT: retl +; X86-LABEL: convert_v2i16_to_v2f32: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: psllq $48, %xmm0 +; X86-NEXT: psrad $16, %xmm0 +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] +; X86-NEXT: cvtdq2ps %xmm0, %xmm0 +; X86-NEXT: movlps %xmm0, (%eax) +; X86-NEXT: retl ; ; X64-LABEL: convert_v2i16_to_v2f32: ; X64: # %bb.0: # %entry