Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15698,8 +15698,13 @@
       if (CstElt->getAPIntValue().ult(VT.getVectorNumElements()))
         DemandedElts.setBit(CstElt->getZExtValue());
     }
-    if (SimplifyDemandedVectorElts(InVec, DemandedElts, true))
+    if (SimplifyDemandedVectorElts(InVec, DemandedElts, true)) {
+      // We simplified the operands of this extract element, but if this extract
+      // is still alive, visit it again to ensure that it is folded properly.
+      if (!N->use_empty())
+        AddToWorklist(N);
       return SDValue(N, 0);
+    }
   }
 
   bool BCNumEltsChanged = false;
Index: llvm/trunk/test/CodeGen/X86/extractelement-load.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/extractelement-load.ll
+++ llvm/trunk/test/CodeGen/X86/extractelement-load.ll
@@ -93,8 +93,7 @@
 ;
 ; X64-SSSE3-LABEL: t4:
 ; X64-SSSE3:       # %bb.0:
-; X64-SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSSE3-NEXT:    movq %xmm0, %rax
+; X64-SSSE3-NEXT:    movq (%rdi), %rax
 ; X64-SSSE3-NEXT:    retq
 ;
 ; X64-AVX-LABEL: t4:
Index: llvm/trunk/test/CodeGen/X86/insertps-combine.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/insertps-combine.ll
+++ llvm/trunk/test/CodeGen/X86/insertps-combine.ll
@@ -284,13 +284,12 @@
 define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) {
 ; SSE-LABEL: extract_lane_insertps_6123:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract_lane_insertps_6123:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = mem[1,0]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %a1 = load <4 x float>, <4 x float> *%p1
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128)
Index: llvm/trunk/test/CodeGen/X86/masked_load.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_load.ll
+++ llvm/trunk/test/CodeGen/X86/masked_load.ll
@@ -556,7 +556,7 @@
 ; SSE2-NEXT:    movss {{.*#+}} xmm5 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3]
 ; SSE2-NEXT:  LBB6_2: ## %else
-; SSE2-NEXT:    packssdw %xmm0, %xmm4
+; SSE2-NEXT:    psrlq $16, %xmm4
 ; SSE2-NEXT:    movd %xmm4, %eax
 ; SSE2-NEXT:    shrl $16, %eax
 ; SSE2-NEXT:    testb $1, %al
Index: llvm/trunk/test/CodeGen/X86/masked_store.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_store.ll
+++ llvm/trunk/test/CodeGen/X86/masked_store.ll
@@ -145,7 +145,7 @@
 ; SSE2-NEXT:  ## %bb.1: ## %cond.store
 ; SSE2-NEXT:    movd %xmm2, (%rdi)
 ; SSE2-NEXT:  LBB2_2: ## %else
-; SSE2-NEXT:    packssdw %xmm0, %xmm4
+; SSE2-NEXT:    psrlq $16, %xmm4
 ; SSE2-NEXT:    movd %xmm4, %eax
 ; SSE2-NEXT:    shrl $16, %eax
 ; SSE2-NEXT:    testb $1, %al
Index: llvm/trunk/test/CodeGen/X86/shrink_vmul-widen.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/shrink_vmul-widen.ll
+++ llvm/trunk/test/CodeGen/X86/shrink_vmul-widen.ll
@@ -1175,7 +1175,6 @@
 ; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
 ; X86-SSE-NEXT:    popl %esi
@@ -1210,7 +1209,6 @@
 ; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; X64-SSE-NEXT:    pmuludq %xmm2, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
 ; X64-SSE-NEXT:    retq
@@ -1965,7 +1963,6 @@
 ; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
@@ -1992,7 +1989,6 @@
 ; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X64-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT:    retq
@@ -2037,7 +2033,6 @@
 ; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
 ; X86-SSE-NEXT:    retl
@@ -2064,7 +2059,6 @@
 ; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
 ; X64-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
 ; X64-SSE-NEXT:    retq
Index: llvm/trunk/test/CodeGen/X86/vec_extract-sse4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_extract-sse4.ll
+++ llvm/trunk/test/CodeGen/X86/vec_extract-sse4.ll
@@ -25,17 +25,13 @@
 define float @t2(<4 x float>* %P1) nounwind {
 ; X32-LABEL: t2:
 ; X32:       # %bb.0:
-; X32-NEXT:    pushl %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    movss %xmm0, (%esp)
-; X32-NEXT:    flds (%esp)
-; X32-NEXT:    popl %eax
+; X32-NEXT:    flds 8(%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: t2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    retq
   %X = load <4 x float>, <4 x float>* %P1
   %tmp = extractelement <4 x float> %X, i32 2
Index: llvm/trunk/test/CodeGen/X86/vector-reduce-mul-widen.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-mul-widen.ll
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-mul-widen.ll
@@ -1863,8 +1863,6 @@
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
Index: llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-mul.ll
@@ -808,7 +808,6 @@
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -817,7 +816,6 @@
 ; AVX512DQVL-LABEL: test_v2i32:
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQVL-NEXT:    retq
@@ -1178,7 +1176,6 @@
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1188,7 +1185,6 @@
 ; AVX512DQVL-LABEL: test_v2i16:
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -1210,7 +1206,7 @@
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    pmuludq %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pextrw $0, %xmm1, %eax
 ; SSE2-NEXT:    # kill: def $ax killed $ax killed $eax
 ; SSE2-NEXT:    retq
 ;
@@ -1658,7 +1654,6 @@
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
@@ -1668,7 +1663,6 @@
 ; AVX512DQVL-LABEL: test_v2i8:
 ; AVX512DQVL:       # %bb.0:
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
@@ -1899,8 +1893,6 @@
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX2-NEXT:    vzeroupper
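
Note: a minimal, hypothetical IR sketch of the pattern this change improves
(the @extract_hi name and body are illustrative only, not one of the updated
tests). SimplifyDemandedVectorElts() can rewrite the shuffle once only one lane
of it is demanded, and re-adding the still-live EXTRACT_VECTOR_ELT to the
worklist lets the combiner visit the extract again and fold the remaining
sequence, as in the t4 diff above:

  define i64 @extract_hi(<2 x i64>* %p) {
    %v = load <2 x i64>, <2 x i64>* %p, align 16
    ; Lane 1 of the swapped vector is lane 0 of the loaded vector.
    %s = shufflevector <2 x i64> %v, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
    %e = extractelement <2 x i64> %s, i32 1
    ret i64 %e
  }

With the worklist fix, x86 codegen for a function like this can collapse to a
single scalar load (e.g. movq (%rdi), %rax) rather than a vector load followed
by a register move.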