Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16803,18 +16803,20 @@ // Try to turn a build vector of zero extends of extract vector elts into a // a vector zero extend and possibly an extract subvector. -// TODO: Support sign extend or any extend? +// TODO: Support sign extend? // TODO: Allow undef elements? -// TODO: Don't require the extracts to start at element 0. SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) { if (LegalOperations) return SDValue(); EVT VT = N->getValueType(0); + bool FoundZeroExtend = false; SDValue Op0 = N->getOperand(0); auto checkElem = [&](SDValue Op) -> int64_t { - if (Op.getOpcode() == ISD::ZERO_EXTEND && + unsigned Opc = Op.getOpcode(); + FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND); + if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) && Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0)) if (auto *C = dyn_cast(Op.getOperand(0).getOperand(1))) @@ -16846,7 +16848,8 @@ SDLoc DL(N); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In, Op0.getOperand(0).getOperand(1)); - return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, In); + return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL, + VT, In); } SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -557,6 +557,40 @@ Known.Zero &= Known2.Zero; } return false; // Don't fall through, will infinitely loop. + case ISD::INSERT_VECTOR_ELT: { + SDValue Vec = Op.getOperand(0); + SDValue Scl = Op.getOperand(1); + auto *CIdx = dyn_cast(Op.getOperand(2)); + EVT VecVT = Vec.getValueType(); + + // If index isn't constant, assume we need all vector elements AND the + // inserted element. + APInt DemandedVecElts(OriginalDemandedElts); + if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) { + unsigned Idx = CIdx->getZExtValue(); + DemandedVecElts.clearBit(Idx); + + // Inserted element is not required. + if (!OriginalDemandedElts[Idx]) + return TLO.CombineTo(Op, Vec); + } + + if (!!DemandedVecElts) + if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts, + Known, TLO, Depth + 1)) + return true; + + KnownBits KnownScl; + unsigned NumSclBits = Scl.getScalarValueSizeInBits(); + APInt DemandedSclBits = OriginalDemandedBits.zextOrTrunc(NumSclBits); + if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1)) + return true; + + KnownScl = KnownScl.zextOrTrunc(BitWidth, false); + Known.One &= KnownScl.One; + Known.Zero &= KnownScl.Zero; + return false; + } case ISD::CONCAT_VECTORS: { Known.Zero.setAllBits(); Known.One.setAllBits(); Index: test/CodeGen/X86/known-signbits-vector.ll =================================================================== --- test/CodeGen/X86/known-signbits-vector.ll +++ test/CodeGen/X86/known-signbits-vector.ll @@ -144,9 +144,9 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: shrdl $30, %ecx, %eax +; X32-NEXT: movl %eax, %ecx ; X32-NEXT: sarl $30, %ecx +; X32-NEXT: shll $2, %eax ; X32-NEXT: vmovd %eax, %xmm0 ; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ; X32-NEXT: vpsrlq $3, %xmm0, %xmm0 Index: test/CodeGen/X86/mulvi32.ll =================================================================== --- test/CodeGen/X86/mulvi32.ll +++ test/CodeGen/X86/mulvi32.ll @@ -312,18 +312,10 @@ ; %ext0 = zext <2 x i32> %0 to <2 x i64> ; %ext1 = zext <2 x i32> %1 to <2 x i64> define <2 x i64> @_mul2xi64toi64a(<2 x i64>, <2 x i64>) { -; SSE2-LABEL: _mul2xi64toi64a: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE42-LABEL: _mul2xi64toi64a: -; SSE42: # %bb.0: -; SSE42-NEXT: pmuludq %xmm1, %xmm0 -; SSE42-NEXT: retq +; SSE-LABEL: _mul2xi64toi64a: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq %xmm1, %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: _mul2xi64toi64a: ; AVX: # %bb.0: Index: test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v16.ll +++ test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1736,7 +1736,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) { ; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movsbl (%rdi), %eax +; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] @@ -1745,7 +1745,7 @@ ; ; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsbl (%rdi), %eax +; SSSE3-NEXT: movzbl (%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: pxor %xmm1, %xmm1 ; SSSE3-NEXT: pshufb %xmm1, %xmm0 @@ -1753,7 +1753,7 @@ ; ; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movsbl (%rdi), %eax +; SSE41-NEXT: movzbl (%rdi), %eax ; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: pxor %xmm1, %xmm1 ; SSE41-NEXT: pshufb %xmm1, %xmm0 @@ -1761,7 +1761,7 @@ ; ; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8: ; AVX1: # %bb.0: -; AVX1-NEXT: movsbl (%rdi), %eax +; AVX1-NEXT: movzbl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 Index: test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v8.ll +++ test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -2652,7 +2652,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) { ; SSE-LABEL: insert_dup_mem_v8i16_sext_i16: ; SSE: # %bb.0: -; SSE-NEXT: movswl (%rdi), %eax +; SSE-NEXT: movzwl (%rdi), %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] @@ -2660,7 +2660,7 @@ ; ; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16: ; AVX1: # %bb.0: -; AVX1-NEXT: movswl (%rdi), %eax +; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] @@ -2668,14 +2668,14 @@ ; ; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: movswl (%rdi), %eax +; AVX2-NEXT: movzwl (%rdi), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movswl (%rdi), %eax +; AVX512VL-NEXT: movzwl (%rdi), %eax ; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0 ; AVX512VL-NEXT: retq %tmp = load i16, i16* %ptr, align 2 Index: test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v16.ll +++ test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -4722,7 +4722,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) { ; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16: ; AVX1: # %bb.0: -; AVX1-NEXT: movswl (%rdi), %eax +; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] @@ -4731,14 +4731,14 @@ ; ; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16: ; AVX2: # %bb.0: -; AVX2-NEXT: movswl (%rdi), %eax +; AVX2-NEXT: movzwl (%rdi), %eax ; AVX2-NEXT: vmovd %eax, %xmm0 ; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movswl (%rdi), %eax +; AVX512VL-NEXT: movzwl (%rdi), %eax ; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0 ; AVX512VL-NEXT: retq %tmp = load i16, i16* %ptr, align 2 Index: test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v32.ll +++ test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -3154,7 +3154,7 @@ define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) { ; AVX1-LABEL: insert_dup_mem_v32i8_sext_i8: ; AVX1: # %bb.0: -; AVX1-NEXT: movsbl (%rdi), %eax +; AVX1-NEXT: movzbl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 Index: test/CodeGen/X86/vector-shuffle-512-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v32.ll +++ test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -233,7 +233,7 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { ; KNL-LABEL: insert_dup_mem_v32i16_sext_i16: ; KNL: ## %bb.0: -; KNL-NEXT: movswl (%rdi), %eax +; KNL-NEXT: movzwl (%rdi), %eax ; KNL-NEXT: vmovd %eax, %xmm0 ; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 ; KNL-NEXT: vmovdqa %ymm0, %ymm1 @@ -241,7 +241,7 @@ ; ; SKX-LABEL: insert_dup_mem_v32i16_sext_i16: ; SKX: ## %bb.0: -; SKX-NEXT: movswl (%rdi), %eax +; SKX-NEXT: movzwl (%rdi), %eax ; SKX-NEXT: vpbroadcastw %eax, %zmm0 ; SKX-NEXT: retq %tmp = load i16, i16* %ptr, align 2