Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7677,7 +7677,8 @@
       ZeroMask.setBit(i);
       continue;
     }
-
+    if (!ISD::isNON_EXTLoad(Elt.getNode()))
+      return SDValue();
     // Each loaded element must be the correct fractional portion of the
     // requested vector load.
     unsigned EltSizeInBits = Elt.getValueSizeInBits();
Index: llvm/test/CodeGen/X86/load-partial.ll
===================================================================
--- llvm/test/CodeGen/X86/load-partial.ll
+++ llvm/test/CodeGen/X86/load-partial.ll
@@ -108,14 +108,33 @@
 }
 
 define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE-LABEL: load_float4_float3_as_float2_float:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movups (%rdi), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: load_float4_float3_as_float2_float:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE2-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps $48, %xmm0, %xmm1 # xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT:    shufps $132, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_float4_float3_as_float2_float:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSSE3-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    shufps $48, %xmm0, %xmm1 # xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT:    shufps $132, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_float4_float3_as_float2_float:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; SSE41-NEXT:    insertps $32, 8(%rdi), %xmm0 # xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT:    retq
+
 ;
 ; AVX-LABEL: load_float4_float3_as_float2_float:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovups (%rdi), %xmm0
+; AVX-NEXT:    vmovsd (%rdi), %xmm0 # xmm0 = mem[0],zero
+; AVX-NEXT:    vinsertps $32, 8(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
   %2 = bitcast <4 x float>* %0 to <2 x float>*
   %3 = load <2 x float>, <2 x float>* %2, align 4
@@ -157,14 +176,37 @@
 }
 
 define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE-LABEL: load_float4_float3_trunc:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: load_float4_float3_trunc:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movlhps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_float4_float3_trunc:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    unpcklps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movlhps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_float4_float3_trunc:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    insertps $16, 4(%rdi), %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT:    insertps $32, 8(%rdi), %xmm0 # xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT:    retq
+
 ;
 ; AVX-LABEL: load_float4_float3_trunc:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    vmovss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vinsertps $16, 4(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT:    vinsertps $32, 8(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
   %2 = bitcast <4 x float>* %0 to i64*
   %3 = load i64, i64* %2, align 16
@@ -185,18 +227,38 @@
 }
 
 define <4 x float> @load_float4_float3_trunc_0122(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE-LABEL: load_float4_float3_trunc_0122:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; SSE-NEXT:    retq
+; SSE2-LABEL: load_float4_float3_trunc_0122:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    shufps $4, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_float4_float3_trunc_0122:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    unpcklps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    shufps $4, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_float4_float3_trunc_0122:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT:    insertps $16, 4(%rdi), %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT:    shufps $4, %xmm1, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: load_float4_float3_trunc_0122:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovaps (%rdi), %xmm1
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
+; AVX-NEXT:    vmovss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT:    vinsertps $16, 4(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT:    vshufps $4, %xmm1, %xmm0, %xmm0 # xmm0 = xmm0[0,1],xmm1[0,0]
 ; AVX-NEXT:    retq
   %2 = bitcast <4 x float>* %0 to i64*
   %3 = load i64, i64* %2, align 16
@@ -220,28 +282,40 @@
 define <4 x float> @load_float4_float3_trunc_0123(<4 x float>* nocapture readonly dereferenceable(16)) {
 ; SSE2-LABEL: load_float4_float3_trunc_0123:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movaps (%rdi), %xmm0
-; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSE2-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss 12(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    movlhps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: load_float4_float3_trunc_0123:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movaps (%rdi), %xmm0
-; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; SSSE3-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    unpcklps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    movss 8(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movss 12(%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    unpcklps %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT:    movlhps %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: load_float4_float3_trunc_0123:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movaps (%rdi), %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    movss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    insertps $16, 4(%rdi), %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT:    insertps $32, 8(%rdi), %xmm0 # xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT:    insertps $48, 12(%rdi), %xmm0 # xmm0 = xmm0[0,1,2],mem[0]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: load_float4_float3_trunc_0123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX-NEXT:    vmovss (%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vinsertps $16, 4(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT:    vinsertps $32, 8(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT:    vinsertps $48, 12(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1,2],mem[0]
 ; AVX-NEXT:    retq
   %2 = bitcast <4 x float>* %0 to i64*
   %3 = load i64, i64* %2, align 16