Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7589,11 +7589,19 @@
     return NewLd;
   };
 
-  // LOAD - all consecutive load/undefs (must start/end with a load).
-  // If we have found an entire vector of loads and undefs, then return a large
-  // load of the entire vector width starting at the base pointer.
-  // If the vector contains zeros, then attempt to shuffle those elements.
-  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+  // Check if the base load is entirely dereferenceable.
+  bool IsDereferenceable =
+      LDBase &&
+      LDBase->getPointerInfo().isDereferenceable(
+          VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
+
+  // LOAD - all consecutive load/undefs (must start/end with a load or be
+  // entirely dereferenceable). If we have found an entire vector of loads and
+  // undefs, then return a large load of the entire vector width starting at the
+  // base pointer. If the vector contains zeros, then attempt to shuffle those
+  // elements.
+  if (FirstLoadedElt == 0 &&
+      (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
     assert(LDBase && "Did not find base load for merging consecutive loads");
     EVT EltVT = LDBase->getValueType(0);
@@ -7614,12 +7622,12 @@
     if (NumElems == 1)
       return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
 
-    if (IsConsecutiveLoad)
+    if (IsConsecutiveLoad && LastLoadedElt == (int)(NumElems - 1))
       return CreateLoad(VT, LDBase);
 
     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
     // vector and a zero vector to clear out the zero elements.
-    if (!isAfterLegalize && VT.isVector() && NumElems == VT.getVectorNumElements()) {
+    if (!isAfterLegalize && VT.isVector()) {
      SmallVector<int, 4> ClearMask(NumElems, -1);
       for (unsigned i = 0; i < NumElems; ++i) {
         if (ZeroMask[i])
Index: test/CodeGen/X86/load-partial.ll
===================================================================
--- test/CodeGen/X86/load-partial.ll
+++ test/CodeGen/X86/load-partial.ll
@@ -10,30 +10,14 @@
 ;
 
 define <4 x float> @load_float4_float3(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE2-LABEL: load_float4_float3:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: load_float4_float3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: load_float4_float3:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: load_float4_float3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_float4_float3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT:    vmovups (%rdi), %xmm0
 ; AVX-NEXT:    retq
   %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
   %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
@@ -48,30 +32,14 @@
 }
 
 define <8 x float> @load_float8_float3(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE2-LABEL: load_float8_float3:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: load_float8_float3:
-; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: load_float8_float3:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT:    retq
+; SSE-LABEL: load_float8_float3:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: load_float8_float3:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT:    vmovups (%rdi), %xmm0
 ; AVX-NEXT:    retq
   %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
   %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
@@ -199,10 +167,7 @@
 ;
 ; AVX-LABEL: load_double4_0u2u:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
 ; AVX-NEXT:    retq
   %2 = load double, double* %0, align 8
   %3 = insertelement <4 x double> undef, double %2, i32 0
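For reference, the new guard reduces to asking MachinePointerInfo whether the whole vector width is readable starting at the base load's address; when it is, trailing undef elements no longer force the per-element load/insert sequence. Below is a minimal sketch of that check under stated assumptions: it reuses the local names LDBase, VT and DAG from EltsFromConsecutiveLoads, and the helper name is hypothetical, not part of the patch.

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/SelectionDAGNodes.h"

  using namespace llvm;

  // True if the memory at LDBase's address is known readable for the full
  // width of VT, so one wide load cannot fault even when the elements past
  // the last explicitly loaded one are undef.
  static bool isFullWidthDereferenceable(const LoadSDNode *LDBase, EVT VT,
                                         const SelectionDAG &DAG) {
    if (!LDBase)
      return false;
    unsigned VecBytes = VT.getSizeInBits() / 8;
    return LDBase->getPointerInfo().isDereferenceable(
        VecBytes, *DAG.getContext(), DAG.getDataLayout());
  }

This is why the load_float4_float3 and load_float8_float3 checks collapse to a single movups/vmovups: the pointer argument is declared dereferenceable(16), so the fourth float is readable even though its value is never used.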