Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -5601,6 +5601,24 @@
     }
   }

+  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
+    SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(),
+                                LDBase->getBasePtr(), LDBase->getPointerInfo(),
+                                LDBase->isVolatile(), LDBase->isNonTemporal(),
+                                LDBase->isInvariant(), LDBase->getAlignment());
+
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain =
+          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+                      SDValue(NewLd.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(NewLd.getNode(), 1));
+    }
+
+    return NewLd;
+  };
+
   // LOAD - all consecutive load/undefs (must start/end with a load).
   // If we have found an entire vector of loads and undefs, then return a large
   // load of the entire vector width starting at the base pointer.
@@ -5616,23 +5634,7 @@
     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
       return SDValue();

-    SDValue NewLd = SDValue();
-
-    NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
-                        LDBase->getPointerInfo(), LDBase->isVolatile(),
-                        LDBase->isNonTemporal(), LDBase->isInvariant(),
-                        LDBase->getAlignment());
-
-    if (LDBase->hasAnyUseOfValue(1)) {
-      SDValue NewChain =
-          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
-                      SDValue(NewLd.getNode(), 1));
-      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
-      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
-                             SDValue(NewLd.getNode(), 1));
-    }
-
-    return NewLd;
+    return CreateLoad(VT, LDBase);
   }

   int LoadSize =
@@ -5667,6 +5669,19 @@
     return DAG.getBitcast(VT, ResNode);
   }
+
+  // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
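+  // CreateLoad emits a single scalar i32 load; SCALAR_TO_VECTOR places it in
+  // element 0 and VZEXT_MOVL zeroes the remaining elements, so instruction
+  // selection can match the whole sequence as one (V)MOVD-style load.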
+  if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
+      ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v4i32)) ||
+       (VT.is256BitVector() && TLI.isTypeLegal(MVT::v8i32)) ||
+       (VT.is512BitVector() && TLI.isTypeLegal(MVT::v16i32)))) {
+    MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
+    SDValue V = CreateLoad(MVT::i32, LDBase);
+    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
+    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
+    return DAG.getBitcast(VT, V);
+  }
+
   return SDValue();
 }
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -3046,6 +3046,18 @@
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+
+  // Represent the same patterns above, but in the form in which they appear
+  // for 512-bit types.
+  def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v16f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 }
 def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                  (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
Index: test/CodeGen/X86/merge-consecutive-loads-128.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -327,14 +327,12 @@
 define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_8i16_i16_34uuuuuu:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pinsrw $0, 6(%rdi), %xmm0
-; SSE-NEXT:    pinsrw $1, 8(%rdi), %xmm0
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_8i16_i16_34uuuuuu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpinsrw $0, 6(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $1, 8(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 3
 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 4
@@ -427,33 +425,14 @@
 }

 define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movzbl (%rdi), %eax
-; SSE2-NEXT:    movzbl 1(%rdi), %ecx
-; SSE2-NEXT:    shll $8, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pinsrw $0, %ecx, %xmm0
-; SSE2-NEXT:    movzbl 3(%rdi), %eax
-; SSE2-NEXT:    shll $8, %eax
-; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    pinsrb $0, (%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $1, 1(%rdi), %xmm0
-; SSE41-NEXT:    pinsrb $3, 3(%rdi), %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $0, (%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $1, 1(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $3, 3(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1
Index: test/CodeGen/X86/merge-consecutive-loads-256.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -401,29 +401,10 @@
 }

 define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(i16* %ptr) nounwind uwtable noinline ssp {
-; AVX1-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
-; AVX1-NEXT:    vpinsrw $1, 18(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
-; AVX2-NEXT:    vpinsrw $1, 18(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $0, 16(%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrw $1, 18(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
+; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    retq
 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 8
 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 9
 %val0 = load i16, i16* %ptr0
@@ -531,9 +512,7 @@
 define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpinsrb $0, 4(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $1, 5(%rdi), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $3, 7(%rdi), %xmm0, %xmm0
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 4
 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 5
@@ -548,32 +527,10 @@
 }

 define <32 x i8> @merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu(i8* %ptr) nounwind uwtable noinline ssp {
-; AVX1-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $0, 2(%rdi), %xmm0, %xmm1
-; AVX1-NEXT:    vpinsrb $1, 3(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vpinsrb $3, 5(%rdi), %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $0, 2(%rdi), %xmm0, %xmm1
-; AVX2-NEXT:    vpinsrb $1, 3(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vpinsrb $3, 5(%rdi), %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $0, 2(%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrb $1, 3(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrb $3, 5(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
+; AVX-LABEL: merge_32i8_i8_23u5uuuuuuuuuuzzzzuuuuuuuuuuuuuu:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    retq
 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 2
 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 3
 %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 5
Index: test/CodeGen/X86/merge-consecutive-loads-512.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -410,21 +410,13 @@
 define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
 ; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrw $1, 6(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrw $0, 4(%rdi), %xmm0, %xmm1
-; AVX512BW-NEXT:    vpinsrw $1, 6(%rdi), %xmm1, %xmm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512BW-NEXT:    retq
 %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
 %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
@@ -474,23 +466,13 @@
 define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
 ; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $0, 1(%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrb $1, 2(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vpinsrb $3, 4(%rdi), %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $0, 1(%rdi), %xmm0, %xmm1
-; AVX512BW-NEXT:    vpinsrb $1, 2(%rdi), %xmm1, %xmm1
-; AVX512BW-NEXT:    vpinsrb $3, 4(%rdi), %xmm1, %xmm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512BW-NEXT:    retq
 %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
 %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2