Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -902,6 +902,10 @@
     for (MVT VT : MVT::fp_vector_valuetypes())
       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 
+    // We want to legalize this to an f64 load rather than an i64 load on
+    // 64-bit targets and two 32-bit loads on a 32-bit target.
+    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+
     setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
     setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
     setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
@@ -26420,6 +26424,26 @@
     }
     break;
   }
+  case ISD::LOAD: {
+    // Use an f64 load and a scalar_to_vector for v2f32 loads. This avoids
+    // scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp cast
+    // since type legalization will try to use an i64 load.
+    EVT VT = N->getValueType(0);
+    assert(VT == MVT::v2f32 && "Unexpected VT");
+    if (!ISD::isNON_EXTLoad(N))
+      return;
+    auto *Ld = cast<LoadSDNode>(N);
+    SDValue Res = DAG.getLoad(MVT::f64, dl, Ld->getChain(), Ld->getBasePtr(),
+                              Ld->getPointerInfo(),
+                              Ld->getAlignment(),
+                              Ld->getMemOperand()->getFlags());
+    SDValue Chain = Res.getValue(1);
+    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Res);
+    Res = DAG.getBitcast(MVT::v4f32, Res);
+    Results.push_back(Res);
+    Results.push_back(Chain);
+    return;
+  }
   }
 }
 
Index: llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector.ll
+++ llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector.ll
@@ -17,8 +17,10 @@
 ;
 ; X86-SSE-LABEL: foo:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: ucomiss {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movaps %xmm0, %xmm1
+; X86-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
+; X86-SSE-NEXT: ucomiss %xmm1, %xmm0
 ; X86-SSE-NEXT: setp %al
 ; X86-SSE-NEXT: retl
 ;
Index: llvm/trunk/test/CodeGen/X86/fold-load-vec.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fold-load-vec.ll
+++ llvm/trunk/test/CodeGen/X86/fold-load-vec.ll
@@ -16,8 +16,8 @@
 ; CHECK-NEXT: movlps %xmm0, (%rsp)
 ; CHECK-NEXT: movlps %xmm0, (%rsi)
 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; CHECK-NEXT: callq ext
 ; CHECK-NEXT: addq $24, %rsp
 ; CHECK-NEXT: retq
Index: llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-256.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -237,33 +237,35 @@
 define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
 ; AVX1-LABEL: merge_8f32_2f32_23z5:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT: vmovups 16(%rdi), %xmm1
-; AVX1-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovups 16(%rdi), %xmm0
+; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: merge_8f32_2f32_23z5:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT: vmovdqu 16(%rdi), %xmm1
-; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovupd 16(%rdi), %xmm0
+; AVX2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: merge_8f32_2f32_23z5:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovdqu 16(%rdi), %xmm1
-; AVX512F-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovupd 16(%rdi), %xmm0
+; AVX512F-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT: retq
 ;
 ; X32-AVX-LABEL: merge_8f32_2f32_23z5:
 ; X32-AVX: # %bb.0:
 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; X32-AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5],mem[6,7]
+; X32-AVX-NEXT: vmovups 16(%eax), %xmm0
+; X32-AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; X32-AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX-NEXT: retl
   %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
   %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
Index: llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ llvm/trunk/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1329,19 +1329,15 @@
 ; X86-AVX1-LABEL: test_mm_loadh_pi:
 ; X86-AVX1: # %bb.0:
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX1-NEXT: # xmm1 = mem[0],zero
-; X86-AVX1-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
-; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0]
+; X86-AVX1-NEXT: vmovhpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x00]
+; X86-AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
 ; X86-AVX1-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_loadh_pi:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX512-NEXT: # xmm1 = mem[0],zero
-; X86-AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
-; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0]
+; X86-AVX512-NEXT: vmovhpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x00]
+; X86-AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
 ; X86-AVX512-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_loadh_pi:
@@ -1396,19 +1392,15 @@
 ; X86-AVX1-LABEL: test_mm_loadl_pi:
 ; X86-AVX1: # %bb.0:
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX1-NEXT: # xmm1 = mem[0],zero
-; X86-AVX1-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX1-NEXT: # xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX1-NEXT: vmovlpd (%eax), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x00]
+; X86-AVX1-NEXT: # xmm0 = mem[0],xmm0[1]
 ; X86-AVX1-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_loadl_pi:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
-; X86-AVX512-NEXT: # xmm1 = mem[0],zero
-; X86-AVX512-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
-; X86-AVX512-NEXT: # xmm0 = xmm1[0,1],xmm0[2,3]
+; X86-AVX512-NEXT: vmovlpd (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x00]
+; X86-AVX512-NEXT: # xmm0 = mem[0],xmm0[1]
 ; X86-AVX512-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_loadl_pi:
Index: llvm/trunk/test/CodeGen/X86/vec_extract-avx.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_extract-avx.ll
+++ llvm/trunk/test/CodeGen/X86/vec_extract-avx.ll
@@ -171,7 +171,9 @@
 ; X32: # %bb.0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X32-NEXT: vmovaps %ymm0, (%eax)
 ; X32-NEXT: vzeroupper
 ; X32-NEXT: retl
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1998,8 +1998,8 @@
 define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
 ; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
Index: llvm/trunk/test/CodeGen/X86/widen_load-1.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/widen_load-1.ll
+++ llvm/trunk/test/CodeGen/X86/widen_load-1.ll
@@ -5,11 +5,11 @@
 
 ; This load should be before the call, not after.
 
-; SSE: movaps compl+128(%rip), %xmm0
+; SSE: movsd compl+128(%rip), %xmm0
 ; SSE: movaps %xmm0, (%rsp)
 ; SSE: callq killcommon
 
-; AVX: vmovaps compl+128(%rip), %xmm0
+; AVX: vmovsd compl+128(%rip), %xmm0
 ; AVX: vmovaps %xmm0, (%rsp)
 ; AVX: callq killcommon
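
For illustration, the input that the new ISD::LOAD custom handling targets is a plain, non-extending <2 x float> load, as in the existing broadcast_v4f32_0101_from_v2f32 test. A minimal IR sketch, not part of the patch (the function and value names here are hypothetical):

define <2 x float> @load_v2f32(<2 x float>* %p) {
  ; Type legalization widens this v2f32 load; with the ReplaceNodeResults
  ; change above it is emitted as a single f64 load plus a scalar_to_vector
  ; and bitcast, instead of an i64 load with an int->fp move on 64-bit
  ; targets or two scalar f32 loads on 32-bit targets.
  %v = load <2 x float>, <2 x float>* %p
  ret <2 x float> %v
}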