Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5639,11 +5639,11 @@
       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
 
   // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
-  // TODO: The code below fires only for for loading the low 64-bits of a
-  // of a 128-bit vector. It's probably worth generalizing more.
   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
-      (VT.is128BitVector() && TLI.isTypeLegal(MVT::v2i64))) {
-    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+      ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v2i64)) ||
+       (VT.is256BitVector() && TLI.isTypeLegal(MVT::v4i64)))) {
+    EVT VecVT = VT.is256BitVector() ? MVT::v4i64 : MVT::v2i64;
+    SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
     SDValue ResNode =
         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -682,11 +682,6 @@
   return cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;
 
-// Like 'X86vzload', but always requires 128-bit vector alignment.
-def alignedX86vzload : PatFrag<(ops node:$ptr), (X86vzload node:$ptr), [{
-  return cast<MemSDNode>(N)->getAlignment() >= 16;
-}]>;
-
 // Like 'load', but always requires 256-bit vector alignment.
 def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 32;
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -5058,6 +5058,8 @@
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
               (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrm addr:$src), sub_xmm)>;
 }
 
 let Predicates = [UseSSE2], AddedComplexity = 20 in {
@@ -5066,13 +5068,6 @@
   def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
 }
 
-let Predicates = [HasAVX] in {
-def : Pat<(v4i64 (alignedX86vzload addr:$src)),
-          (SUBREG_TO_REG (i32 0), (VMOVAPSrm addr:$src), sub_xmm)>;
-def : Pat<(v4i64 (X86vzload addr:$src)),
-          (SUBREG_TO_REG (i32 0), (VMOVUPSrm addr:$src), sub_xmm)>;
-}
-
 //===---------------------------------------------------------------------===//
 // Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
 // IA32 document. movq xmm1, xmm2 does clear the high bits.
Index: test/CodeGen/X86/merge-consecutive-loads-256.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -214,19 +214,10 @@
 }
 
 define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp {
-; AVX1-LABEL: merge_8f32_f32_12zzuuzz:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: merge_8f32_f32_12zzuuzz:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: merge_8f32_f32_12zzuuzz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 2
   %val0 = load float, float* %ptr0
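
An illustrative sketch, not part of the patch: the generalized EltsFromConsecutiveLoads check together with the new v4i64 X86vzload pattern should give the same single-vmovq lowering to integer analogues of the test above. The function below is hypothetical (it does not exist in merge-consecutive-loads-256.ll) and is untested; it only sketches the kind of 256-bit build_vector the new code path targets.

define <8 x i32> @merge_8i32_i32_12zzzzzz(i32* %ptr) nounwind {
  ; Two consecutive 32-bit loads fill the low 64 bits of the vector...
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  ; ...and the remaining lanes are zero, so on AVX the whole build_vector is
  ; expected to become a single zero-extending 64-bit load (X86ISD::VZEXT_LOAD,
  ; selected as vmovq) rather than a vmovq plus vxorps/vinsertf128.
  %res0 = insertelement <8 x i32> zeroinitializer, i32 %val0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %val1, i32 1
  ret <8 x i32> %res1
}

Because the v4i64 X86vzload is now selected directly to VMOVZQI2PQIrm (a 64-bit vmovq) instead of a full 128-bit VMOVAPS/VMOVUPS load, the aligned/unaligned distinction that alignedX86vzload encoded is presumably no longer needed, which is why that fragment and the old HasAVX patterns are removed above.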